In [209]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier

import re
import string
from nltk.corpus import stopwords

##########
## SST2 ##
##########

class SentAnalyzer():
    def __init__(self, sourcePath):
        self.data = pd.read_csv(sourcePath)
        
        # self.indexes = self.data['index']
        self.labels = self.data['label']
        
        # Do we wanna clean the data?
        self.cleanData = True

        # self.texts = self.data['text']
        self.texts = self.data["text"]
        self.textsL1 = self.data.loc[self.data['label'] == 1]["text"]
        self.textsL0 = self.data.loc[self.data['label'] == 0]["text"]

        self.stop_words = stopwords.words('english')
        self.regexStopWords = ""
        self.conj = "( |\W|^[a-zA-Z0-9])"
        for w in self.stop_words:
            self.regexStopWords += self.conj + w + self.conj + "|"
        self.regexStopWords = self.regexStopWords[:len(self.regexStopWords)-1]

    def cleanString(self,text):
        x = " " + text.lower()
        x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
        x = re.sub(regexStopWords, ' ', x)
        return x

    def checkBalance(self):
        len1 = len(self.textsL1)
        len0 = len(self.textsL0)
        print('Label 1:',len1)
        print('Label 0:',len0)
        if len1 > len0:
            print('Their ratio:',round(abs(len1/len0),2))
        else:
            print('Their ratio:',round(abs(len0/len1),2))

    def nGrams(self, data, minN,maxN,n):
        for i in range(minN,maxN):
            cv = CountVectorizer(ngram_range = (i,i))
            if self.cleanData:
                cv = CountVectorizer(ngram_range = (i,i),preprocessor=self.cleanString)

            word_count_vector = cv.fit_transform(data)
            # print(word_count_vector)
            
            # Fit the model into the data
            tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True) 
            tfidf_transformer.fit(word_count_vector)
            
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(),columns = ["tfidf"])
            
            # sort ascending 
            print(df_idf.sort_values(by = ['tfidf'])[:n])
    
    def lengthCorrelation(self):
        lengths = [(lambda x: len(x[1]))(x) for x in self.texts.items()]
        lenLab = pd.DataFrame( list(zip(lengths, self.labels)), columns = ["length","labels"])
        print(lenLab.corr())

    def textTfidfValues(self,data):
        # this steps generates word counts for the words in your docs 
        cv = CountVectorizer()
        if self.cleanData:
            cv = CountVectorizer(preprocessor = self.cleanString)
        
        word_count_vector = cv.fit_transform(data)

        # print(word_count_vector)
        
        tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
        tfidf_transformer.fit(word_count_vector)
        
        self.tfidf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(), columns = ["tfidf"])
        
        # sort ascending 
        # print(self.tfidf_transformer.idf_)
        return self.tfidf.sort_values(by = ["tfidf"])

In [207]:
# Load the SST data
train = SentAnalyzer("stsa.binary.phrases.train")

In [332]:
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords 

text = "unless bob crane is someone of particular interest to you , this film 's impressive performances and adept direction are n't likely to leave a lasting impression ."
stopWords = set(stopwords.words('english')) 
words = word_tokenize(text)
for w in words: 
    if w not in stopWords: 
        wordsFiltered.append(w) 

In [208]:
n = 20
print("The", n ,"most common word overall and within the two label.")
print("Overall:")
print(train.textTfidfValues(train.texts[5:])[:n])
# print("Label 0:")
# print(train.textTfidfValues(train.textsL0).iloc[:n])
# print("Label 1:")
# print(train.textTfidfValues(train.textsL1).iloc[:n])

The 20 most common word overall and within the two label.
Overall:
               tfidf
the         2.796291
film        3.829284
movie       3.927944
it          4.380901
one         4.477473
its         4.503470
like        4.645675
and         4.656686
an          4.694639
is          4.796347
good        4.818583
be          4.835290
story       4.875811
this        4.945879
rrb         5.070997
funny       5.075577
characters  5.097233
comedy      5.126589
to          5.133051
lrb         5.155174
Label 0:
               tfidf
the         2.761252
movie       3.741244
film        3.978675
it          4.189152
like        4.311754
its         4.453336
one         4.496133
be          4.645542
is          4.672389
and         4.686088
bad         4.711701
not         4.731952
this        4.746497
much        4.893551
story       4.904931
an          4.907796
characters  5.007276
rrb         5.018431
little      5.057652
of          5.072765
Label 1:
           tfidf
the     2.825890

In [129]:
# Get the n-grams
print("Whole database")
train.nGrams(train.texts, 1, 4, 5)

Whole database
        tfidf
the  2.210299
and  2.367068
of   2.490469
to   2.807791
is   3.125087
             tfidf
of the    4.053253
in the    4.658701
the film  4.880207
to the    5.155174
to be     5.196617
                 tfidf
one of the    6.096209
the film is   6.918348
the kind of   7.009320
the movie is  7.086281
of the year   7.103573


In [130]:
# Get the n-grams of Label 1
print("Label 1 sentences")
train.nGrams(train.textsL1, 1, 4, 5)

Label 1 sentences
        tfidf
the  2.228244
and  2.294130
of   2.487614
to   2.936536
is   3.195215
             tfidf
of the    3.997627
in the    4.694099
the film  4.920578
to the    5.166961
and the   5.273170
                tfidf
one of the   5.771063
the film is  6.724343
of the year  6.807409
of the most  6.906664
of the best  6.933097


In [131]:
# Get the n-grams of Label 0
print("Label 0 sentences")
train.nGrams(train.textsL0, 1, 4, 5)

Label 0 sentences
        tfidf
the  2.188806
and  2.463642
of   2.493857
to   2.670419
is   3.045642
              tfidf
of the     4.124832
in the     4.616176
the film   4.831845
to be      5.060991
the movie  5.115987
                 tfidf
the movie is  6.610394
one of the    6.718383
of the film   6.900705
the kind of   7.097873
of its own    7.137093


In [211]:
train.lengthCorrelation()

          length    labels
length  1.000000 -0.037769
labels -0.037769  1.000000


In [249]:
import os
import json
import pandas as pd
import gzip
from urllib.request import urlopen

###################
## Amazon Review ##
###################

# Load in the Amazon Review Data with 5-core
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Software_5.json.gz')
df = df.fillna('')
# print('shape', df.shape)

products = df["asin"].drop_duplicates()
reviewers = df["reviewerID"].drop_duplicates()
# print('Products\n',products)
# print('Reviewers\n',reviewers)

# df.loc[df['reviewerID'] == ]

In [250]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,


In [337]:
df.loc[df["asin"] == '0321719816'].head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,


In [392]:
import math 

# i = 0
# userOwnReviews = []
# for person in reviewers.values:
#     i += 1
#     # print(person)
#     # itemsReviewed = df.loc[df["reviewerID"] == person][['asin','overall','reviewerName']]
#     itemsReviewed = df.loc[df["reviewerID"] == person][['asin','overall']]
#     print(itemsReviewed.head())
#     if i > 5:
#         break

def simularityScore(person1,person2):
    # Movies and scores
    u1Reviews = df.loc[df["reviewerID"] == person1][['asin','overall']]
    u2Reviews = df.loc[df["reviewerID"] == person2][['asin','overall']]
    
    eDiff = 0
    # looping through each movie
    for movie in u1Reviews.values:
        # print(movie[0])
        # print(u2Reviews['asin'].values)
        # if there is a match, then calculating the square difference
        if movie[0] in u2Reviews['asin'].values:
            print("in")
            u1score = u1Reviews.loc[u1Reviews["asin"] == movie[0]]['overall'].values[0]
            print(u1score)
            u2score = u2Reviews.loc[u2Reviews["asin"] == movie[0]]['overall'].values[0]
            print(u2score)
            diff = u1score - u2score
            eDiff += diff**2

    # returning it
    return math.sqrt(eDiff)

# for person1 in reviewers.values:
#     for person2 in reviewers.values:
#         if person1 != person2:
#             print("person1:", person1, "person2:", person2, "score:",simularityScore(person1,person2))
person1 = "A38NELQT98S4H8"
person2 = "A3QJU4FEN8PQSZ"
# u1Reviews = df.loc[df["reviewerID"] == person1][['asin','overall']]
# u2Reviews = df.loc[df["reviewerID"] == person2][['asin','overall']]
# if "0321719816" in u1Reviews['asin'].values:
#     print("done",u2Reviews[u2Reviews['asin'] == "032171981"]['overall'].values[0])
# else:
#     print("nope",u1Reviews['asin'].values)
print("person1:", person1, "person2:", person2, "score:", simularityScore(person1,person2))

# df.loc[df["reviewerID"] == 'A38NELQT98S4H8'][['asin','overall']].head()
# df.loc[df["reviewerID"] == 'A3QJU4FEN8PQSZ'][['asin','overall']].head()

in
4.0
4.0
person1: A38NELQT98S4H8 person2: A3QJU4FEN8PQSZ score: 0.0


4.0

In [222]:
df.loc[df['reviewerID'] == 'A38NELQT98S4H8']

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,0321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
2201,4.0,False,"10 14, 2009",A38NELQT98S4H8,B000XHE0Q0,,WB Halper,...that hasn't already been said?\n\nIf you're...,What can you say about Microsoft Office...,1255478400,,
3015,4.0,False,"10 14, 2009",A38NELQT98S4H8,B000XHE0Q0,,WB Halper,...that hasn't already been said?\n\nIf you're...,What can you say about Microsoft Office...,1255478400,,
4705,1.0,False,"06 24, 2010",A38NELQT98S4H8,B003HCA7GA,,WB Halper,"Final Update - On February 1, 2011, I went to ...",Not quite ready for prime time...and now it ne...,1277337600,47.0,
7575,5.0,False,"10 17, 2013",A38NELQT98S4H8,B00DT04I9W,{'Platform:': ' PC/Mac Disc'},WB Halper,I've been playing with painter since it came o...,Flippin' Phenominal,1381968000,2.0,
8360,5.0,False,"12 31, 2013",A38NELQT98S4H8,B00F2BA8U4,,WB Halper,I have been a user of Lightroom for several ye...,A worthwhile competitor for Adobe's Lightroom,1388448000,11.0,
9248,4.0,False,"08 3, 2014",A38NELQT98S4H8,B00JZNHU88,{'Format:': ' Misc. Supplies'},WB Halper,This is basically Norton Internet Security wit...,Easy to install on mutiple computers....,1407024000,,
10330,4.0,False,"12 29, 2014",A38NELQT98S4H8,B00OW2PHDM,{'Platform:': ' PC'},WB Halper,I have many old 8mm video tapes that were made...,Using it to transcribe old videos...,1419811200,5.0,
11624,4.0,False,"05 16, 2016",A38NELQT98S4H8,B015PIIGQ6,{'Format:': ' Software'},WB Halper,I've been using this to convert old family vid...,Works beautifully to convert and clean up old ...,1463356800,,


In [225]:
df.loc[df['asin'] == '0321719816']

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,
5,5.0,False,"09 26, 2010",A2BVNVJOFXGZUB,321719816,{'Format:': ' DVD-ROM'},J. Howard,The video is well-paced and delivered in an un...,Excellent.,1285459200,,
6,5.0,False,"04 7, 2011",A2JMJVNTBL7K7E,321719816,{'Format:': ' DVD-ROM'},Yesuaini99,I spent several hours on the lesson and I love...,excellent video training material,1302134400,,
7,5.0,False,"01 8, 2011",A14JBDSWKPKTZA,321719816,{'Format:': ' DVD-ROM'},Bob Feeser,I have had Dreamweaver MX2004 since it came ou...,Great Video for a Difficult at Times Program,1294444800,,
8,5.0,False,"10 28, 2010",A2WCFDOCS73PNB,321719816,{'Format:': ' DVD-ROM'},Gadgetman,I have also taken a local community college on...,Excellent value for the price,1288224000,4.0,
9,5.0,False,"10 28, 2010",A14638TGYH7GD9,321719816,{'Format:': ' DVD-ROM'},Rebecca Haden,Even though I use Dreamweaver a great deal and...,Buy this with your copy of Dreamweaver,1288224000,6.0,


In [215]:
df.describe()

Unnamed: 0,overall,unixReviewTime
count,12805.0,12805.0
mean,3.87786,1350001000.0
std,1.362086,101756900.0
min,1.0,961977600.0
25%,3.0,1266365000.0
50%,4.0,1371686000.0
75%,5.0,1427328000.0
max,5.0,1535242000.0


In [134]:
# pip install krippendorff

In [263]:
# Crowdsourcing Exercise
import krippendorff
import pandas as pd
import numpy as np

# importing libraries
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class CrowdSourcer():
    
    def __init__(self, sourcePath):
        # Get data
        self.data = pd.read_csv(sourcePath)

        # The feature phrases removed (To separate rating)
        cols = []
        for i in range(29):
            cols.append("feature" + str(i + 1))

        
        self.ratings = self.data.drop(["Sentence Index","Ground Truth Labels"] + cols, axis = 1)
        self.groundTruth = self.data["Ground Truth Labels"]
        self.workers = self.ratings.columns
        # features = data.drop(["Sentence Index","Ground Truth Labels"] + workers.values[:], axis = 1)
        # print(self.ratings)

    def calcKrippendorff(self):
        missing = ""        
        kripCoef = krippendorff.alpha(reliability_data = self.ratings)
        print("Krippendorff coeficient: %.3f" % kripCoef)

    # Generate the prediction matrix out of the global sheet data.
    def trainByCS(self, X_train, y_train,voteType = "hard", solver = "lbfgs"):
        # group / ensemble of models
        self.estimator = []
        self.estimator.append(("LR", 
                        LogisticRegression(solver = solver, 
                                            multi_class ="multinomial", 
                                            max_iter = 200)))
        self.estimator.append(("SVC", SVC(gamma = "auto", probability = True)))
        self.estimator.append(("DTC", DecisionTreeClassifier()))

        # Voting Classifier with hard voting
        self.voters = VotingClassifier(estimators = self.estimator, voting = voteType)
        self.voters.fit(X_train, y_train)
        

    def checkPrediction(self,ratings,truths):
        y_pred = self.voters.predict(ratings)
        # print(whole)
        misses = 0
        for guess,truth in zip(y_pred, truths):
            # print(guess,truth)
            if guess != truth:
                misses += 1
        print(misses, "miss(es) out of",len(truths), " accurary:", str(100*round( 1-(misses/len(truths)) , 2)) + "%" )
        return str(100*round( 1-(misses/len(truths)) , 2))
        
collabData = CrowdSourcer("GlobalSheet.csv")

In [136]:
collabData.calcKrippendorff()

Krippendorff coeficient: 0.005


In [137]:
# Training and testing the crowdsourcing
testSize = 0.91

print("size of train data:", round( len(collabData.groundTruth)*(1-testSize)) )
print("size of test data:",  round( len(collabData.groundTruth)*testSize,0) )

#### TESTING PARAMETERS ####
# solvers = ["newton-cg","lbfgs"]
# for s in solvers:
#     print(s)
#     for j in range(2):
#         vote = "hard"
#         if j == 1:
#             vote = "soft"
#         print("with", vote,"vote")
#         for i in range(20):
#             X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
#                                                                 collabData.groundTruth,
#                                                                 test_size = testSize,
#                                                                 random_state = 42)

#             # collabData.trainByCS(X_train,y_train,"soft")
#             collabData.trainByCS(X_train,y_train,vote,s)
#             collabData.checkPrediction(X_test,y_test)
#############################

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
                                                        collabData.groundTruth,
                                                        test_size = testSize,
                                                        random_state = 42)

    collabData.trainByCS(X_train,y_train,"hard")
    collabData.checkPrediction(X_test,y_test)

size of train data: 9
size of test data: 92.0
3 miss(es) out of 92  accurary: 97.0%
3 miss(es) out of 92  accurary: 97.0%
4 miss(es) out of 92  accurary: 96.0%
4 miss(es) out of 92  accurary: 96.0%
3 miss(es) out of 92  accurary: 97.0%


In [138]:
# pip install gensim

In [139]:
from sklearn.base import TransformerMixin, BaseEstimator

class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        # print(X.shape)
        self.shape = X.shape
        # print(self.shape)
        return X

    def fit(self, X, y=None, **fit_params):
        return self

In [314]:
import numpy as np

# SKLearn
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

class Recommender(SentAnalyzer):
    def __init__(self, sourcePath):
        SentAnalyzer.__init__(self, sourcePath)
        
        self.cleanData = True # (Overright)
        self.testSize = 0.5
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.texts,
                                                                                self.labels,
                                                                                test_size = self.testSize,
                                                                                random_state = 42)

    def getDocTermMtx(self,n_min = 1,n_max = 3):
        self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True)
        if self.cleanData:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True, preprocessor = self.cleanString)

        self.cv.fit(self.texts.values)
        self.wcVector = self.cv.transform(self.texts.values)

        self.vocab = list(self.cv.vocabulary_.items())
        print('Vocabulary len:',len(self.vocab))
        print("Word Vector matrix:",self.wcVector.shape)

        # Unigram Tf-Idf
        
        self.tfidf = TfidfTransformer()
        self.tfidf.fit(self.wcVector)

        # Numericalize the train dataset
        self.docTermMtx = self.tfidf.transform(self.wcVector)
        print('Document-term matrix',self.docTermMtx.shape)
        
    def dimensionReduce(self,n):
        svd = TruncatedSVD(n_components = n, n_iter=7, random_state=42)
        svd.fit(self.docTermMtx)
        self.redDocTermMtx = svd.transform(self.docTermMtx)

        print(svd.explained_variance_ratio_)
        print()
        print(svd.explained_variance_ratio_.sum())
        print()
        print('Eigen values',svd.singular_values_)
        print()
        print('redDocTermMtx',self.redDocTermMtx)

    def linReg(self):
        self.lr = LogisticRegression(random_state=0)
        self.lrResult = self.lr.fit(self.X_train.values,self.y_train)

    def withPipeline(self, n_min = 1,n_max = 1, svd_iter = 7,predictor = "sgd", useDimRed = True, nDim = 100):
        start = time.time()

        # Pipeline components
        pipeArray = []
        if self.cleanData:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True, preprocessor = self.cleanString)
            pipeArray.append(("CV", self.cv))
        else:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True)
            pipeArray.append(("CV", self.cv))
        
        self.tfidf = TfidfTransformer()
        pipeArray.append(("TF-IDF", self.tfidf))
        
        # Print doc-term-mtx info
        # self.debug = Debug()
        # pipeArray.append(("Debug", self.debug))
        
        if useDimRed:
            self.svd = TruncatedSVD(n_components = nDim, n_iter = svd_iter, random_state = 42)
            pipeArray.append(("SVD", self.svd))
        
        # Predictor used
        if predictor == "lr":
            self.lr = LogisticRegression(random_state = 0)
            pipeArray.append(("LR", self.lr))
        elif predictor == "sgd":
            self.sgd = SGDClassifier(max_iter=1000, tol=1e-3)
            pipeArray.append(("SGD", self.sgd))
        
        # Process pipeline
        self.pipe = Pipeline(pipeArray)
        self.result = self.pipe.fit(self.X_train.values, self.y_train)
        end = time.time()
        runtime = round(end - start, 2)

        # Test
        if useDimRed:
            print('With dims:', nDim ,', tf-idf dims:', self.debug.shape , "using", predictor ,", runtime:",runtime)
        else:
            print('Without dim reduction, tf-idf dims:', self.debug.shape , "using", predictor ,", runtime:",runtime)
        self.test()

    def test(self, predicter = None):
        if predicter == None:
            predicter = self.pipe
        self.pred = predicter.predict(self.X_test.values)
        self.rightAns = 0
        for pred, ans in zip(self.pred,self.y_test.values):
            if pred == ans:
                self.rightAns += 1
        print('Right answers:',self.rightAns, 'of',len(self.pred),
                'so', str(round(100*self.rightAns/len(self.pred),2)) + "%" )
            
    def predictWithModels(self):
        self.model = Word2Vec.load("word2vec.model")
        self.model.train(self.X_train, total_examples=1, epochs=1)
        print("train score:", model.score(self.X_train, self.y_train))
        print("test score:", model.score(self.X_test, self.y_test))

In [315]:
sstRec = Recommender("stsa.binary.phrases.train")

In [276]:
# sstRec.getDocTermMtx()

Vocabulary len: 167532
Word Vector matrix: (76961, 167532)
Document-term matrix (76961, 167532)


In [142]:
# print(cleanData[:5].values)

[[' worst revenge the nerds clichés filmmakers could dredge up']
 ['solid acting a neat premise']
 ['ode unconditional love compassion garnered years seeing all   condition the old privy       often misconstrued weakness']
 ['a   do joan philip  repetitive arguments   schemes treachery']
 [' movie silly beyond comprehension   even it n silly   would still beyond comprehension  ']]


In [334]:
import nltk

nltk.download()
# sstRec.texts.values

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [316]:
sstRec.cleanData = False
sstRec.withPipeline()
# With dims: 20 , tf-idf dims: (38480, 14332) using LR, runtime 4.49
# Right answers: 25438 of 38481 so 66.11%

With dims: 20 , tf-idf dims: (38480, 14332) using LR, runtime 4.49
Right answers: 25438 of 38481 so 66.11%


In [324]:
sstRec.cleanData = False
sstRec.withPipeline(predictor="lr")
# With dims: 20 , tf-idf dims: (38480, 14332) using LR, runtime 5.23
# Right answers: 25960 of 38481 so 67.46%

With dims: 20 , tf-idf dims: (38480, 14332) using LR, runtime 5.23
Right answers: 25960 of 38481 so 67.46%


In [317]:
sstRec.cleanData = True
sstRec.withPipeline()
# With dims: 20 , tf-idf dims: (38480, 14328) using LR, runtime 55.0
# Right answers: 24349 of 38481 so 63.28%

With dims: 20 , tf-idf dims: (38480, 14328) using LR, runtime 55.0
Right answers: 24349 of 38481 so 63.28%


In [320]:
# SGD
sstRec.cleanData = True
sstRec.withPipeline(nDim=200)
# With dims: 200 , tf-idf dims: (38480, 14328) using LR, runtime 59.45
# Right answers: 26262 of 38481 so 68.25%

With dims: 20 , tf-idf dims: (38480, 14328) using LR, runtime 59.45
Right answers: 26262 of 38481 so 68.25%


In [323]:
# LR
sstRec.cleanData = True
sstRec.withPipeline(nDim=300,predictor="lr")
# With dims: 300 , tf-idf dims: (38480, 14328) using LR, runtime 63.17
# Right answers: 28227 of 38481 so 73.35%

With dims: 20 , tf-idf dims: (38480, 14328) using LR, runtime 63.17
Right answers: 28227 of 38481 so 73.35%


In [None]:
import time

start = time.time()
sstRec.withPipeline(n_min = 1, n_max = 1) # 65.1%
sstRec.withPipeline(n_min = 1, n_max = 2) # 64.15%
sstRec.withPipeline(n_min = 1, n_max = 3) # 63.79%
sstRec.withPipeline(n_min = 1, n_max = 4) # 63.71%
end = time.time()
print("runtime:",round(end-start,2),"msp")

In [None]:
# sstRec.

In [None]:
sstRec.getDocTermMtx()
sstRec.linReg()

In [None]:
sstRec.dimensionReduce(100)

In [None]:
# Here I tried the differences between 
# the CountVectorizer with different parameters

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Unigram Counts

unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(sstRec.texts.values)

X_train_unigram = unigram_vectorizer.transform(sstRec.texts.values)

# Unigram Tf-Idf
unigram_tf_idf = TfidfTransformer()
unigram_tf_idf.fit(X_train_unigram)

X_train_unigram_tf_idf = unigram_tf_idf.transform(X_train_unigram)

# trigram Counts
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit(sstRec.texts.values)

X_train_trigram = trigram_vectorizer.transform(sstRec.texts.values)


# trigram Tf-Idf
trigram_tf_idf = TfidfTransformer()
trigram_tf_idf.fit(X_train_trigram)

X_train_trigram_tf_idf = trigram_tf_idf.transform(X_train_trigram)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

def train_and_show_scores(X, y, title) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size = 0.75, stratify = y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

y_train = sstRec.labels.values

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')