# Stanford Data-set (SST) - Rotten Tomatoes reviews
## Analysis

In [93]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import VotingClassifier

import re
import string
from nltk.corpus import stopwords

class SentAnalyzer():
    def __init__(self, sourcePath):
        self.data = pd.read_csv(sourcePath)
        
        # self.indexes = self.data['index']
        self.labels = self.data['label']
        
        # Do we wanna clean the data?
        self.cleanData = True

        # self.texts = self.data['text']
        self.texts = self.data["text"]
        self.textsL1 = self.data.loc[self.data['label'] == 1]["text"]
        self.textsL0 = self.data.loc[self.data['label'] == 0]["text"]

        self.stop_words = stopwords.words('english')
        self.regexStopWords = ""
        self.ws = "( |\W)"
        for w in self.stop_words:
            self.regexStopWords += self.ws + w + self.ws + "|"
            # self.regexStopWords += w + self.ws + "|"
        self.regexStopWords = self.regexStopWords[:len(self.regexStopWords)-1]

    def cleanString(self,text):
        x = " " + text.lower()
        x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
        x = re.sub('(%s)' % re.escape(self.regexStopWords), ' ', x)
        print(x)
        return x

    def checkBalance(self):
        len1 = len(self.textsL1)
        len0 = len(self.textsL0)
        print('Label 1:',len1)
        print('Label 0:',len0)
        if len1 > len0:
            print('Their ratio:',round(abs(len1/len0),2))
        else:
            print('Their ratio:',round(abs(len0/len1),2))

    def nGrams(self, data, minN,maxN,n):
        for i in range(minN,maxN):
            cv = CountVectorizer(ngram_range = (i,i))
            if self.cleanData:
                cv = CountVectorizer(ngram_range = (i,i),preprocessor=self.cleanString)

            word_count_vector = cv.fit_transform(data)
            # print(word_count_vector)
            
            # Fit the model into the data
            tfidf_transformer = TfidfTransformer(smooth_idf=True,use_idf=True) 
            tfidf_transformer.fit(word_count_vector)
            
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(),columns = ["tfidf"])
            
            # sort ascending 
            print(df_idf.sort_values(by = ['tfidf'])[:n])
    
    def lengthCorrelation(self):
        lengths = [(lambda x: len(x[1]))(x) for x in self.texts.items()]
        lenLab = pd.DataFrame( list(zip(lengths, self.labels)), columns = ["length","labels"])
        print(lenLab.corr())

    def textTfidfValues(self,data):
        # this steps generates word counts for the words in your docs 
        cv = CountVectorizer()
        if self.cleanData:
            cv = CountVectorizer(preprocessor = self.cleanString)
        
        word_count_vector = cv.fit_transform(data)

        # print(word_count_vector)
        
        tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
        tfidf_transformer.fit(word_count_vector)
        
        self.tfidf = pd.DataFrame(tfidf_transformer.idf_, index = cv.get_feature_names(), columns = ["tfidf"])
        
        # sort ascending 
        # print(self.tfidf_transformer.idf_)
        return self.tfidf.sort_values(by = ["tfidf"])

# Load the SST data

In [94]:
train = SentAnalyzer("stsa.binary.phrases.train")

## Testing cleaning data

In [106]:
stop_words = ["the"]
regexStopWords = ""
ws = "( )"
for w in stop_words:
    regexStopWords += ws + w + ws + "|"
    # regexStopWords += w + ws + "|"
regexStopWords = regexStopWords[:len(regexStopWords)-1]
def cleanString(text):
    print("From:\n",text)
    x = " " + text.lower() + " "
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    x = re.sub('(%s)' % re.escape(regexStopWords), ' ', x)
    print("To:\n",text)
    return x

text = "the worst revenge-of-the-nerds clichés the filmmakers could dredge up"
cleanString(text)

From:
 the worst revenge-of-the-nerds clichés the filmmakers could dredge up
To:
 the worst revenge-of-the-nerds clichés the filmmakers could dredge up


' the worst revenge of the nerds clichés the filmmakers could dredge up '

In [108]:
# train.data[["label","text"]].head(20)

In [95]:
n = 20
print("The", n ,"most common word overall and within the two label.")
print("Overall:")
print(train.textTfidfValues( train.texts.values ).iloc[:n])
# print("Label 0:")
# print(train.textTfidfValues(train.textsL0).iloc[:n])
# print("Label 1:")
# print(train.textTfidfValues(train.textsL1).iloc[:n])

The 20 most common word overall and within the two label.
Overall:
          tfidf
the    2.210299
and    2.367068
of     2.490469
to     2.807791
is     3.125087
that   3.243210
in     3.278603
it     3.374383
with   3.699859
film   3.829284
an     3.847267
for    3.871987
as     3.890294
its    3.916836
movie  3.927944
this   4.036873
but    4.166196
be     4.231125
on     4.323021
you    4.358615


In [204]:
# Get the n-grams
print("Whole database")
# train.nGrams(train.texts, 1, 4, 5)

Whole database


In [205]:
# Get the n-grams of Label 1
print("Label 1 sentences")
# train.nGrams(train.textsL1, 1, 4, 5)

Label 1 sentences


In [206]:
# Get the n-grams of Label 0
print("Label 0 sentences")
# train.nGrams(train.textsL0, 1, 4, 5)

Label 0 sentences


In [207]:
train.lengthCorrelation()

          length    labels
length  1.000000 -0.037769
labels -0.037769  1.000000


# Amazon Review data-set

In [4]:
import os
import json
import pandas as pd
import gzip
from urllib.request import urlopen

###################
## Amazon Review ##
###################

# Load in the Amazon Review Data with 5-core
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Software_5.json.gz')
df = df.fillna('')
# print('shape', df.shape)

products = df["asin"].drop_duplicates()
reviewers = df["reviewerID"].drop_duplicates()
# print('Products\n',products)
# print('Reviewers\n',reviewers)

# df.loc[df['reviewerID'] == ]

In [216]:
df.head(20)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,
5,5.0,False,"09 26, 2010",A2BVNVJOFXGZUB,321719816,{'Format:': ' DVD-ROM'},J. Howard,The video is well-paced and delivered in an un...,Excellent.,1285459200,,
6,5.0,False,"04 7, 2011",A2JMJVNTBL7K7E,321719816,{'Format:': ' DVD-ROM'},Yesuaini99,I spent several hours on the lesson and I love...,excellent video training material,1302134400,,
7,5.0,False,"01 8, 2011",A14JBDSWKPKTZA,321719816,{'Format:': ' DVD-ROM'},Bob Feeser,I have had Dreamweaver MX2004 since it came ou...,Great Video for a Difficult at Times Program,1294444800,,
8,5.0,False,"10 28, 2010",A2WCFDOCS73PNB,321719816,{'Format:': ' DVD-ROM'},Gadgetman,I have also taken a local community college on...,Excellent value for the price,1288224000,4.0,
9,5.0,False,"10 28, 2010",A14638TGYH7GD9,321719816,{'Format:': ' DVD-ROM'},Rebecca Haden,Even though I use Dreamweaver a great deal and...,Buy this with your copy of Dreamweaver,1288224000,6.0,


In [223]:
import math 

def simularityScore(person1,person2):
    # Movies and scores
    u1Reviews = df.loc[df["reviewerID"] == person1][['asin','overall']]
    u2Reviews = df.loc[df["reviewerID"] == person2][['asin','overall']]
    
    eDiff = 0
    # looping through each movie
    for movie in u1Reviews.values:
        u1score = u1Reviews.loc[u1Reviews["asin"] == movie[0]]['overall'].values[0]
        
        # if there is a match, then calculating the square difference
        if movie[0] in u2Reviews['asin'].values:
            u2score = u2Reviews.loc[u2Reviews["asin"] == movie[0]]['overall'].values[0]
            diff = u1score - u2score
            eDiff += diff**2
        # If the other haven't seen the same movie
        else:
            eDiff += u1score**2

    # returning it
    return math.sqrt(eDiff)

def getRecomForUserByProd(user,product,numRec = 5):
    # Others who reviewed the product
    reviewed = df.loc[df["asin"] == product][['reviewerID']]
    reviewedProd = reviewed
    
    # Get similarity array of relevant people
    simDict = []
    for u2 in reviewedProd.values:
        if user != u2:
            simDict += [(u2, simularityScore(user, u2[0] ))]
    
    # Similarity array in order
    simDict = sorted(simDict, key=lambda tup: tup[1])
    
    # Create recommendations
    seenProducts = df.loc[df.reviewerID == user][['asin']].values
    # The product needs to be good enough
    minRate = 3.5
    
    recommendations = []
    it = 0
    while len(recommendations) < numRec and it < 5:
        prods = df.loc[df.reviewerID == simDict[it][0][0] ][['asin','overall']].values
        
        for (asin,overall) in prods:
            # Not bad and unseen product
            if overall > minRate and asin not in seenProducts:
                recommendations += [asin]
            if len(recommendations) >= numRec:
                break
        it += 1
    # Give recommendations
    return recommendations

def getRecomForUser(user,numRec = 5):
    reviewers = df.reviewerID.drop_duplicates()

    simDict = []
    for u2 in reviewers.values:
        if user != u2:
            sim = simularityScore(user,u2)
            simDict += [(u2,sim)]
            # print("User1:", user," User2:", u2," sim:",sim)
    
    # Similarity array in order
    simDict = sorted(simDict, key = lambda tup: tup[1])
    # print('simDict: ',simDict[:20])
    # print('first: ',simDict[0])
    
    # Create recommendations
    seenProducts = df.loc[df.reviewerID == user][['asin']].values
    recommendations = []
    minRate = 3.5
    it = 0
    while len(recommendations) < numRec and it < len(simDict):
        prods = df.loc[df.reviewerID == simDict[it][0] ][['asin','overall']].values
        
        for (asin,overall) in prods:
            # Not bad and unseen product
            if overall > minRate and asin not in seenProducts:
                recommendations += [asin]
            if len(recommendations) >= numRec:
                break
        # it += 1
    return recommendations

## Calculate avarage number of ratings per user and product

In [224]:
# per user
reviewers = df["reviewerID"].drop_duplicates().values
rs = 0
for r in reviewers:
    rs += len(df.loc[df.reviewerID == r ].values)
print("Avarage number of ratings per user:", round(rs / len(reviewers) ,0) )

# per product
products = df["asin"].drop_duplicates().values
pr = 0
for p in products:
    pr += len(df.loc[df.asin == p ].values)
print("Avarage number of ratings per product:", round(pr / len(products) ,0) )

Avarage number of ratings per user: 7.0
Avarage number of ratings per product: 16.0


## Recommender System with Collaborative filtering

In [226]:
# Simularity score check
import time
person1 = "AYUF7YETYOLNX"
person2 = "A3QJU4FEN8PQSZ"
product1 = "0321700945"

# print("person1:", person1, "person2:", person2, "score:", simularityScore(person1,person2))
# print("person1:", person1, "person2:", person2, "score:", simularityScore(person2,person1))

# Recommending
t1 = time.time()
print(getRecomForUserByProd(person1,product1) )
t2 = time.time()
print("getRecomForUserByProd runtime:", round(t2-t1,5))
print(getRecomForUser(person1) )
t3 = time.time()
print("getRecomForUser runtime: ", round(t3-t2,5))

['B00NASFCDO', 'B003ZRTDPS', '0321700945', 'B000PC1IR2', 'B0017KEA6W']
getRecomForUserByProd runtime: 0.09634
['0321719824', 'B005LTV8G0', 'B008YTAGIK', 'B0148BYIPY', '0321719824']
getRecomForUser runtime:  15.47598


In [158]:
# df.loc[df['reviewerID'] == 'A38NELQT98S4H8']

In [159]:
# df.loc[df['asin'] == '0321719816']

In [160]:
# df.describe()

In [33]:
# pip install krippendorff

# Crowdsourcing Exercise

In [34]:
import krippendorff
import pandas as pd
import numpy as np

# importing libraries
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class CrowdSourcer():
    
    def __init__(self, sourcePath):
        # Get data
        self.data = pd.read_csv(sourcePath)

        # The feature phrases removed (To separate rating)
        cols = []
        for i in range(29):
            cols.append("feature" + str(i + 1))

        
        self.ratings = self.data.drop(["Sentence Index","Ground Truth Labels"] + cols, axis = 1)
        self.groundTruth = self.data["Ground Truth Labels"]
        self.workers = self.ratings.columns
        # features = data.drop(["Sentence Index","Ground Truth Labels"] + workers.values[:], axis = 1)
        # print(self.ratings)

    def calcKrippendorff(self):
        missing = ""        
        kripCoef = krippendorff.alpha(reliability_data = self.ratings)
        print("Krippendorff coeficient: %.3f" % kripCoef)

    # Generate the prediction matrix out of the global sheet data.
    def trainByCS(self, X_train, y_train,voteType = "hard", solver = "lbfgs"):
        # group / ensemble of models
        self.lr = LogisticRegression(solver = solver, multi_class ="multinomial", max_iter = 200)
        self.svc = SVC(gamma = "auto", probability = True)
        self.dtc = DecisionTreeClassifier()
        self.estimator = [("LR", self.lr)),("SVC", self.svc)),("DTC", self.dtc)]
        
        # Voting Classifier with hard voting
        self.voters = VotingClassifier(estimators = self.estimator, voting = voteType)
        self.voters.fit(X_train, y_train)
        

    def checkPrediction(self,ratings,truths):
        y_pred = self.voters.predict(ratings)
        # print(whole)
        misses = 0
        for guess,truth in zip(y_pred, truths):
            # print(guess,truth)
            if guess != truth:
                misses += 1
        print(misses, "miss(es) out of",len(truths), " accurary:", str(100*round( 1-(misses/len(truths)) , 2)) + "%" )
        return str(100*round( 1-(misses/len(truths)) , 2))
        
collabData = CrowdSourcer("GlobalSheet.csv")

In [35]:
collabData.calcKrippendorff()

Krippendorff coeficient: 0.005


In [36]:
# Training and testing the crowdsourcing
testSize = 0.91

print("size of train data:", round( len(collabData.groundTruth)*(1-testSize)) )
print("size of test data:",  round( len(collabData.groundTruth)*testSize,0) )

#### TESTING PARAMETERS ####
# solvers = ["newton-cg","lbfgs"]
# for s in solvers:
#     print(s)
#     for j in range(2):
#         vote = "hard"
#         if j == 1:
#             vote = "soft"
#         print("with", vote,"vote")
#         for i in range(20):
#             X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
#                                                                 collabData.groundTruth,
#                                                                 test_size = testSize,
#                                                                 random_state = 42)

#             # collabData.trainByCS(X_train,y_train,"soft")
#             collabData.trainByCS(X_train,y_train,vote,s)
#             collabData.checkPrediction(X_test,y_test)
#############################

for i in range(5):
    X_train, X_test, y_train, y_test = train_test_split(collabData.ratings,
                                                        collabData.groundTruth,
                                                        test_size = testSize,
                                                        random_state = 42)

    collabData.trainByCS(X_train,y_train,"hard")
    collabData.checkPrediction(X_test,y_test)

size of train data: 9
size of test data: 92.0
3 miss(es) out of 92  accurary: 97.0%
4 miss(es) out of 92  accurary: 96.0%
4 miss(es) out of 92  accurary: 96.0%
2 miss(es) out of 92  accurary: 98.0%
4 miss(es) out of 92  accurary: 96.0%


In [37]:
# pip install gensim

In [119]:
from sklearn.base import TransformerMixin, BaseEstimator

class Debug(BaseEstimator, TransformerMixin):

    def transform(self, X):
        # print("Shape",X.shape)
        self.shape = X.shape
        # print(self.shape)
        return X

    def fit(self, X, y=None, **fit_params):
        return self

# Stanford Data-set (SST) - Rotten Tomatoes reviews
## Recommender System

In [120]:
import numpy as np
import time

# SKLearn
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

class Recommender(SentAnalyzer):
    def __init__(self, sourcePath):
        SentAnalyzer.__init__(self, sourcePath)
        
        self.cleanData = True # (Overright)
        self.testSize = 0.5
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.texts,
                                                                                self.labels,
                                                                                test_size = self.testSize,
                                                                                random_state = 42)

    def getDocTermMtx(self,n_min = 1,n_max = 3):
        self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True)
        if self.cleanData:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True, preprocessor = self.cleanString)

        self.cv.fit(self.texts.values)
        self.wcVector = self.cv.transform(self.texts.values)

        self.vocab = list(self.cv.vocabulary_.items())
        print('Vocabulary len:',len(self.vocab))
        print("Word Vector matrix:",self.wcVector.shape)

        # Unigram Tf-Idf
        
        self.tfidf = TfidfTransformer()
        self.tfidf.fit(self.wcVector)

        # Numericalize the train dataset
        self.docTermMtx = self.tfidf.transform(self.wcVector)
        print('Document-term matrix',self.docTermMtx.shape)
        
    def dimensionReduce(self,n):
        svd = TruncatedSVD(n_components = n, n_iter=7, random_state=42)
        svd.fit(self.docTermMtx)
        self.redDocTermMtx = svd.transform(self.docTermMtx)

        print(svd.explained_variance_ratio_)
        print()
        print(svd.explained_variance_ratio_.sum())
        print()
        print('Eigen values',svd.singular_values_)
        print()
        print('redDocTermMtx',self.redDocTermMtx)

    def linReg(self):
        self.lr = LogisticRegression(random_state = 0)
        self.lrResult = self.lr.fit(self.X_train.values,self.y_train)

    def withPipeline(self, n_min = 1, n_max = 1, svd_iter = 7, predictor = "sgd", useDimRed = True, nDim = 100):
        start = time.time()

        # Pipeline components
        pipeArray = []
        if self.cleanData:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True, preprocessor = self.cleanString)
            pipeArray.append(("CV", self.cv))
        else:
            self.cv = CountVectorizer(ngram_range=(n_min, n_max), lowercase = True)
            pipeArray.append(("CV", self.cv))
        
        self.tfidf = TfidfTransformer()
        pipeArray.append(("TF-IDF", self.tfidf))
        
        # Print doc-term-mtx info
        self.debug = Debug()
        pipeArray.append(("Debug", self.debug))
        
        if useDimRed:
            self.svd = TruncatedSVD(n_components = nDim, n_iter = svd_iter, random_state = 42)
            pipeArray.append(("SVD", self.svd))
        
        # Predictor used
        if predictor == "lr":
            self.lr = LogisticRegression(random_state = 0)
            pipeArray.append(("LR", self.lr))
        elif predictor == "sgd":
            self.sgd = SGDClassifier(max_iter=1000, tol=1e-3)
            pipeArray.append(("SGD", self.sgd))
        
        # Process pipeline
        self.pipe = Pipeline(pipeArray)
        self.result = self.pipe.fit(self.X_train.values, self.y_train)
        end = time.time()
        runtime = round(end - start, 2)

        # Test
        if useDimRed:
            print('With dims:', nDim ,', tf-idf dims:', self.debug.shape , "using", predictor ,", runtime:",runtime)
        else:
            print('Without dim reduction, tf-idf dims:', self.debug.shape , "using", predictor ,", runtime:",runtime)
        self.test()

    def test(self, predicter = None):
        if predicter == None:
            predicter = self.pipe
        self.pred = predicter.predict(self.X_test.values)
        self.rightAns = 0
        for pred, ans in zip(self.pred,self.y_test.values):
            if pred == ans:
                self.rightAns += 1
        print('Right answers:',self.rightAns, 'of',len(self.pred),
                'so', str(round(100*self.rightAns/len(self.pred),2)) + "%" )
            
    def predictWithModels(self):
        self.model = Word2Vec.load("word2vec.model")
        self.model.train(self.X_train, total_examples=1, epochs=1)
        print("train score:", model.score(self.X_train, self.y_train))
        print("test score:", model.score(self.X_test, self.y_test))

In [121]:
sstRec = Recommender("stsa.binary.phrases.train")

In [41]:
# sstRec.getDocTermMtx()

In [42]:
# print(cleanData[:5].values)

In [43]:
import nltk

nltk.download()
# sstRec.texts.values

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
import time
sstRec.cleanData = False
sstRec.withPipeline()
# With dims: 20 , tf-idf dims: (38480, 14332) using LR, runtime 4.49
# Right answers: 25438 of 38481 so 66.11%

In [122]:
sstRec.cleanData = False
sstRec.withPipeline(predictor="lr")

With dims: 100 , tf-idf dims: (38480, 14332) using lr , runtime: 5.31
Right answers: 25960 of 38481 so 67.46%


In [123]:
sstRec.cleanData = True
sstRec.withPipeline()

With dims: 100 , tf-idf dims: (38480, 14332) using sgd , runtime: 26.14
Right answers: 24829 of 38481 so 64.52%


In [124]:
# SGD
sstRec.cleanData = True
sstRec.withPipeline(nDim=200,n_max=3)

With dims: 200 , tf-idf dims: (38480, 175637) using sgd , runtime: 56.89
Right answers: 27088 of 38481 so 70.39%


In [125]:
# LR
sstRec.cleanData = True
sstRec.withPipeline(nDim=200,predictor="lr")

With dims: 200 , tf-idf dims: (38480, 14332) using lr , runtime: 29.58
Right answers: 27248 of 38481 so 70.81%


In [126]:
# LR
sstRec.cleanData = True
sstRec.withPipeline(nDim=1000,predictor="lr",n_max=3)

With dims: 1000 , tf-idf dims: (38480, 175637) using lr , runtime: 261.97
Right answers: 30117 of 38481 so 78.26%


In [None]:
import time

start = time.time()
sstRec.withPipeline(n_min = 1, n_max = 1) # 65.1%
sstRec.withPipeline(n_min = 1, n_max = 2) # 64.15%
sstRec.withPipeline(n_min = 1, n_max = 3) # 63.79%
sstRec.withPipeline(n_min = 1, n_max = 4) # 63.71%
end = time.time()
print("runtime:",round(end-start,2),"msp")

In [None]:
# sstRec.

In [None]:
sstRec.getDocTermMtx()
sstRec.linReg()

In [None]:
sstRec.dimensionReduce(100)

In [None]:
# Here I tried the differences between 
# the CountVectorizer with different parameters

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Unigram Counts

unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(sstRec.texts.values)

X_train_unigram = unigram_vectorizer.transform(sstRec.texts.values)

# Unigram Tf-Idf
unigram_tf_idf = TfidfTransformer()
unigram_tf_idf.fit(X_train_unigram)

X_train_unigram_tf_idf = unigram_tf_idf.transform(X_train_unigram)

# trigram Counts
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit(sstRec.texts.values)

X_train_trigram = trigram_vectorizer.transform(sstRec.texts.values)


# trigram Tf-Idf
trigram_tf_idf = TfidfTransformer()
trigram_tf_idf.fit(X_train_trigram)

X_train_trigram_tf_idf = trigram_tf_idf.transform(X_train_trigram)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

def train_and_show_scores(X, y, title) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size = 0.75, stratify = y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

y_train = sstRec.labels.values

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')

train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_trigram, y_train, 'Trigram Counts')
train_and_show_scores(X_train_trigram_tf_idf, y_train, 'Trigram Tf-Idf')