In [91]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier

##########
## SST2 ##
##########

class SentAnalyzer():
    def __init__(self, sourcePath):
        self.data = pd.read_csv(sourcePath)
        self.indexes = self.data['index']
        self.labels = self.data['label']
        self.texts = self.data['text']
        self.textsL1 = self.data.loc[self.data['label'] == 1]["text"]
        self.textsL0 = self.data.loc[self.data['label'] == 0]["text"]
    
    def checkBalance(self):
        len1 = len(self.textsL1)
        len0 = len(self.textsL0)
        print('Label 1:',len1)
        print('Label 0:',len0)
        if len1 > len0:
            print('Their ratio:',round(abs(len1/len0),2))
        else:
            print('Their ratio:',round(abs(len0/len1),2))

    def nGrams(self, data, minN,maxN,n):
        for i in range(minN,maxN):
            cv = CountVectorizer(ngram_range = (i,i))
            word_count_vector = cv.fit_transform(data)
            # print(word_count_vector)
            
            # Fit the model into the data
            tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True) 
            tfidf_transformer.fit(word_count_vector)
            
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(),columns = ["tfidf"])
            
            # sort ascending 
            print(df_idf.sort_values(by = ['tfidf'])[:n])
    
    def lengthCorrelation(self):
        lengths = [(lambda x: len(x[1]))(x) for x in self.texts.items()]
        lenLab = pd.DataFrame( list(zip(lengths, self.labels)), columns = ["length","labels"])
        print(lenLab.corr())

    def textTfidfValues(self,data):
        # this steps generates word counts for the words in your docs 
        cv = CountVectorizer()
        word_count_vector = cv.fit_transform(data)

        # print(word_count_vector)
        
        tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
        tfidf_transformer.fit(word_count_vector)
        
        self.tfidf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(), columns = ["tfidf"])
        
        # sort ascending 
        # print(self.tfidf_transformer.idf_)
        return self.tfidf.sort_values(by = ["tfidf"])

In [92]:
# Load the SST data
train = SentAnalyzer("stsa.binary.phrases.train")

In [93]:
n = 5
print("The", n ,"most common word overall and within the two label.")
print("Overall:")
print(train.textTfidfValues(train.texts)[:n])
print("Label 0:")
print(train.textTfidfValues(train.textsL0).iloc[:n])
print("Label 1:")
print(train.textTfidfValues(train.textsL1).iloc[:n])

The 5 most common word overall and within the two label.
Overall:
        tfidf
the  2.210299
and  2.367068
of   2.490469
to   2.807791
is   3.125087
Label 0:
        tfidf
the  2.188806
and  2.463642
of   2.493857
to   2.670419
is   3.045642
Label 1:
        tfidf
the  2.228244
and  2.294130
of   2.487614
to   2.936536
is   3.195215


In [94]:
# Get the n-grams
print("Whole database")
train.nGrams(train.texts, 1, 4, 5)

Whole database
        tfidf
the  2.210299
and  2.367068
of   2.490469
to   2.807791
is   3.125087
             tfidf
of the    4.053253
in the    4.658701
the film  4.880207
to the    5.155174
to be     5.196617
                 tfidf
one of the    6.096209
the film is   6.918348
the kind of   7.009320
the movie is  7.086281
of the year   7.103573


In [95]:
# Get the n-grams of Label 1
print("Label 1 sentences")
train.nGrams(train.textsL1, 1, 4, 5)

Label 1 sentences
        tfidf
the  2.228244
and  2.294130
of   2.487614
to   2.936536
is   3.195215
             tfidf
of the    3.997627
in the    4.694099
the film  4.920578
to the    5.166961
and the   5.273170
                tfidf
one of the   5.771063
the film is  6.724343
of the year  6.807409
of the most  6.906664
of the best  6.933097


In [96]:
# Get the n-grams of Label 0
print("Label 0 sentences")
train.nGrams(train.textsL0, 1, 4, 5)

Label 0 sentences
        tfidf
the  2.188806
and  2.463642
of   2.493857
to   2.670419
is   3.045642
              tfidf
of the     4.124832
in the     4.616176
the film   4.831845
to be      5.060991
the movie  5.115987
                 tfidf
the movie is  6.610394
one of the    6.718383
of the film   6.900705
the kind of   7.097873
of its own    7.137093


In [97]:
train.lengthCorrelation()

          length    labels
length  1.000000 -0.037769
labels -0.037769  1.000000


In [98]:
import os
import json
import pandas as pd
import gzip
from urllib.request import urlopen

###################
## Amazon Review ##
###################

# Load in the Amazon Review Data with 5-core
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Software_5.json.gz')
df = df.fillna('')
print('shape', df.shape)
i = 0
for (columnName, columnData) in df.iteritems():
    i+=1
    if i > 5:
        break
    print('Colunm Name : ', columnName)
    print('Column Contents : ', columnData.values[:5])

shape (12805, 12)
Colunm Name :  overall
Column Contents :  [4. 4. 5. 5. 5.]
Colunm Name :  verified
Column Contents :  [False False False False False]
Colunm Name :  reviewTime
Column Contents :  ['10 20, 2010' '10 18, 2010' '10 16, 2010' '10 12, 2010' '10 7, 2010']
Colunm Name :  reviewerID
Column Contents :  ['A38NELQT98S4H8' 'A3QJU4FEN8PQSZ' 'ACJT8MUC0LRF0' 'AYUF7YETYOLNX'
 'A31ICLWQ9CSHRS']
Colunm Name :  asin
Column Contents :  ['0321719816' '0321719816' '0321719816' '0321719816' '0321719816']


In [99]:
# pip install krippendorff

In [100]:
# Crowdsourcing Exercise
import krippendorff
import pandas as pd
import numpy as np

# importing libraries
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class CrowdSourcer():
    
    def __init__(self, sourcePath):
        # Get data
        self.data = pd.read_csv(sourcePath)

        # The feature phrases removed (To separate rating)
        cols = []
        for i in range(29):
            cols.append("feature"+str(i+1))

        
        self.ratings = self.data.drop(["Sentence Index","Ground Truth Labels"] + cols,axis=1)
        self.groundTruth = self.data["Ground Truth Labels"]
        self.workers = self.ratings.columns
        # features = data.drop(["Sentence Index","Ground Truth Labels"] + workers.values[:], axis = 1)
        # print(self.ratings)

    def calcKrippendorff(self):
        missing = ""        
        kripCoef = krippendorff.alpha(reliability_data = self.ratings)
        print("Krippendorff coeficient: %.3f" % kripCoef)

    # Generate the prediction matrix out of the global sheet data.
    def trainByCS(self, X_train, y_train,voteType = "hard", solver = "lbfgs"):
        # group / ensemble of models
        self.estimator = []
        self.estimator.append(("LR", 
                        LogisticRegression(solver = solver, 
                                            multi_class ="multinomial", 
                                            max_iter = 200)))
        self.estimator.append(("SVC", SVC(gamma = "auto", probability = True)))
        self.estimator.append(("DTC", DecisionTreeClassifier()))

        # Voting Classifier with hard voting
        self.voters = VotingClassifier(estimators = self.estimator, voting = voteType)
        self.voters.fit(X_train, y_train)
        

    def checkPrediction(self,ratings,truths):
        y_pred = self.voters.predict(ratings)
        # print(whole)
        misses = 0
        for guess,truth in zip(y_pred, truths):
            # print(guess,truth)
            if guess != truth:
                misses += 1
        print(misses, "miss(es) out of",len(truths), " accurary:", str(100*round( 1-(misses/len(truths)) , 2)) + "%" )
        return str(100*round( 1-(misses/len(truths)) , 2))
        

collabData = CrowdSourcer("GlobalSheet.csv")

In [101]:
collabData.calcKrippendorff()

Krippendorff coeficient: 0.005


In [102]:
# Training and testing the crowdsourcing
testSize = 0.91

print("size of train data:", round( len(collabData.groundTruth)*(1-testSize)) )
print("size of test data:",  round( len(collabData.groundTruth)*testSize,0) )

#### TESTING PARAMETERS ####
# solvers = ["newton-cg","lbfgs"]
# for s in solvers:
#     print(s)
#     for j in range(2):
#         vote = "hard"
#         if j == 1:
#             vote = "soft"
#         print("with", vote,"vote")
#         for i in range(20):
#             X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
#                                                                 collabData.groundTruth,
#                                                                 test_size = testSize,
#                                                                 random_state = 42)

#             # collabData.trainByCS(X_train,y_train,"soft")
#             collabData.trainByCS(X_train,y_train,vote,s)
#             collabData.checkPrediction(X_test,y_test)
#############################

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
                                                        collabData.groundTruth,
                                                        test_size = testSize,
                                                        random_state = 42)

    collabData.trainByCS(X_train,y_train,"hard")
    collabData.checkPrediction(X_test,y_test)

size of train data: 9
size of test data: 92.0
4 miss(es) out of 92  accurary: 96.0%
3 miss(es) out of 92  accurary: 97.0%
4 miss(es) out of 92  accurary: 96.0%
4 miss(es) out of 92  accurary: 96.0%
4 miss(es) out of 92  accurary: 96.0%


In [103]:
# pip install gensim

In [104]:
# SKLearn
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

class Recommender(SentAnalyzer):
    def __init__(self, sourcePath):
        SentAnalyzer.__init__(self, sourcePath)
        self.testSize = 0.5
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.texts,
                                                                                self.labels,
                                                                                test_size = testSize,
                                                                                random_state = 42)

    def getDocTermMtx(self):
        self.cv = CountVectorizer(ngram_range=(1, 3), lowercase = True)
        self.cv.fit(self.texts.values)
        self.wcVector = self.cv.transform(self.texts.values)

        self.vocab = list(self.cv.vocabulary_.items())
        print('Vocabulary len:',len(self.vocab))
        print("Word Vector matrix:",self.wcVector.shape)

        # Unigram Tf-Idf
        
        self.tfidf = TfidfTransformer()
        self.tfidf.fit(self.wcVector)

        # Numericalize the train dataset
        self.docTermMtx = self.tfidf.transform(self.wcVector)
        print('Document-term matrix',self.docTermMtx.shape)
        
    def dimensionReduce(self):
        svd = TruncatedSVD(n_components = 100, n_iter=7, random_state=42)
        svd.fit(self.docTermMtx)

        print(svd.explained_variance_ratio_)
        print()
        print(svd.explained_variance_ratio_.sum())
        print()
        print('Eigen values',svd.singular_values_)

    def predictWithModels(self):
        self.model = Word2Vec.load("word2vec.model")
        self.model.train(self.X_train, total_examples=1, epochs=1)
        # print("train score:", model.score(self.X_train, self.y_train))
        # print("test score:", model.score(self.X_test, self.y_test))

print("start")
sstRec = Recommender("stsa.binary.phrases.train")
# sstRec.predictWithModels()
print("done")

start
done


In [105]:
sstRec.getDocTermMtx()

Vocabulary len: 199518
Word Vector matrix: (76961, 199518)
Document-term matrix (76961, 199518)


In [106]:
sstRec.dimensionReduce()

[0.00121149 0.00128441 0.00117962 0.00099757 0.00091534 0.00089047
 0.00086159 0.00084731 0.00080133 0.00077393 0.00076205 0.00075188
 0.00071879 0.00068555 0.00066685 0.00064739 0.00064214 0.0006407
 0.00059597 0.00059384 0.00059141 0.0005794  0.00057164 0.00056456
 0.00055109 0.00054799 0.00054204 0.00052532 0.00052025 0.00051376
 0.0005109  0.00050428 0.00050088 0.00048616 0.00047748 0.00046825
 0.00046533 0.0004636  0.00045509 0.0004464  0.00043525 0.0004327
 0.00042803 0.00041918 0.00041732 0.00041331 0.00040727 0.00040469
 0.00040002 0.00039647 0.00039161 0.00038776 0.00038475 0.00037984
 0.00037849 0.00037695 0.00037333 0.00036999 0.00036589 0.00036348
 0.0003602  0.00035359 0.00035304 0.00035192 0.00034937 0.00034482
 0.00034234 0.00033849 0.00033823 0.00033695 0.00033581 0.0003336
 0.00033319 0.00033089 0.00032919 0.00032556 0.00032508 0.00032314
 0.00031941 0.00031857 0.00031762 0.00031692 0.0003126  0.00031172
 0.00030996 0.00030882 0.00030611 0.00030461 0.00030224 0.0002996

In [110]:
# Here I tried the differences between 
# the CountVectorizer with different parameters

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Unigram Counts

unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(sstRec.texts.values)

X_train_unigram = unigram_vectorizer.transform(sstRec.texts.values)

# Unigram Tf-Idf
unigram_tf_idf = TfidfTransformer()
unigram_tf_idf.fit(X_train_unigram)

X_train_unigram_tf_idf = unigram_tf_idf.transform(X_train_unigram)

# trigram Counts
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit(sstRec.texts.values)

X_train_trigram = trigram_vectorizer.transform(sstRec.texts.values)


# trigram Tf-Idf
trigram_tf_idf = TfidfTransformer()
trigram_tf_idf.fit(X_train_trigram)

X_train_trigram_tf_idf = trigram_tf_idf.transform(X_train_trigram)

In [112]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

def train_and_show_scores(X, y, title) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size = 0.75, stratify = y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

y_train = sstRec.labels.values

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')

Unigram Counts
Train score: 0.92 ; Validation score: 0.88

Unigram Tf-Idf
Train score: 0.9 ; Validation score: 0.87

Trigram Counts
Train score: 0.97 ; Validation score: 0.91

Trigram Tf-Idf
Train score: 0.93 ; Validation score: 0.88

Unigram Counts
Train score: 0.92 ; Validation score: 0.88

Unigram Tf-Idf
Train score: 0.9 ; Validation score: 0.87

Trigram Counts
Train score: 0.97 ; Validation score: 0.91

Trigram Tf-Idf
Train score: 0.93 ; Validation score: 0.88

