In [6]:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split



In [8]:
import time
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import nltk
from IPython.display import IFrame
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import os
from sklearn.ensemble import RandomForestClassifier

Data Loading and Merging from GitHub

In [9]:
base_url = "https://raw.githubusercontent.com/PietroParenti/youtube-comments-classifier/main/"

file_names = [
    "Youtube01-Psy.csv",
    "Youtube02-KatyPerry.csv",
    "Youtube03-LMFAO.csv",
    "Youtube04-Eminem.csv",
    "Youtube05-Shakira.csv"
]

urls = [base_url + fname for fname in file_names]

dataframes = [pd.read_csv(url) for url in urls]

df_original = pd.concat(dataframes, ignore_index=True)

df_original.loc[df_original.index<5,:]

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


Dataset Overview

In [10]:
df = df_original

print("="*50)
print("Number of rows: ")
print(df.shape[0])

print("\n" + "="*50)
print("Number of columns: ")
print(df.shape[1])

print("\n" + "="*50)
print('Column names')
print(list(df.columns))

Number of rows: 
1956

Number of columns: 
5

Column names
['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS']


Missing Values Check

In [11]:
print("="*50)
for col in ['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS']:
    missingValueMask = df[col].isna()
    num_missing = missingValueMask.sum()

    if num_missing == 0:
        print(f'No missing values in column {col}')
    else:
        print(f'{num_missing} missing values in column {col}')

No missing values in column COMMENT_ID
No missing values in column AUTHOR
245 missing values in column DATE
No missing values in column CONTENT
No missing values in column CLASS


Presence of missing values only in date column. The date column will not be used in analisys, so NA are not removed

Class Distribution

In [12]:
print("="*50)
print("Observation count per class:")
print(df['CLASS'].value_counts())
print()

print("\n" + "="*50)
print("Observation percentage per class:")
print(df['CLASS'].value_counts(normalize=True) * 100)


Observation count per class:
CLASS
1    1005
0     951
Name: count, dtype: int64


Observation percentage per class:
CLASS
1    51.380368
0    48.619632
Name: proportion, dtype: float64


The dataset is unbalanced. To implement the ml algoritm is better to have balanced data.

Downsampling

In [13]:
df_unbalanced=df


def underSample2Min(df, labelName):
    vc = df.loc[:,labelName].value_counts()
    lab2freq = dict(zip(vc.index.tolist(), vc.values.tolist()))
    minfreq = min(lab2freq.values())
    idxSample=[]
    for selectedLabel, actualFreq in lab2freq.items():
        selIndexes=df.loc[df.loc[:,labelName]==selectedLabel, :].sample(n=minfreq).index.tolist()
        idxSample+=selIndexes
    idxSample.sort()

    df2 = df.loc[idxSample, :]
    df2 = df2.reset_index()
    return df2

df = underSample2Min(df_unbalanced, 'CLASS')

print("="*50)
print("Observation count per class:")
print(df['CLASS'].value_counts())

Observation count per class:
CLASS
1    951
0    951
Name: count, dtype: int64


Splitting the dataset into train and test

In [14]:
xAll = df.loc[:,'CONTENT']
yAll = df.loc[:,'CLASS']
print('type(xAll)', type(xAll), 'xAll.shape', xAll.shape)
print('type(yAll)', type(yAll), 'xAll.shape', yAll.shape)
print()

xTrainVec, xTestVec, yTrain, yTest = train_test_split(
    xAll, yAll,
    test_size=0.30,
    random_state=0,
    stratify=yAll
)

varDi = {'xAll':xAll, 'yAll':yAll, 'xTrainVec':xTrainVec, 'xTestVec':xTestVec, 'yTrain':yTrain, 'yTest':yTest}
for varName, var in varDi.items(): # return a list of key-value pairs
  print(varName, var.shape)

xTrain = list(xTrainVec)
xTest = list(xTestVec)

type(xAll) <class 'pandas.core.series.Series'> xAll.shape (1902,)
type(yAll) <class 'pandas.core.series.Series'> xAll.shape (1902,)

xAll (1902,)
yAll (1902,)
xTrainVec (1331,)
xTestVec (571,)
yTrain (1331,)
yTest (571,)


Simple pipeline

In [15]:
cp = Pipeline(
    [
      ('vectorizer', CountVectorizer()),
      ('classifier', LogisticRegression() ),
    ]
)

clsfParams = {
   'classifier__C': 0.001,
   'vectorizer__stop_words':'english',
   'vectorizer__ngram_range': (1,1),
}


cp.set_params(**clsfParams)
cp.fit(xTrain, yTrain)

yPred=cp.predict(xTest)

print("="*50)
print('Classification report')
clasRepSt01 = classification_report(yTest,yPred)
print(clasRepSt01)

print('Accuracy')
print(accuracy_score(yTest,yPred))
print("="*50)

Classification report
              precision    recall  f1-score   support

           0       0.79      0.97      0.87       286
           1       0.96      0.75      0.84       285

    accuracy                           0.86       571
   macro avg       0.88      0.86      0.86       571
weighted avg       0.88      0.86      0.86       571

Accuracy
0.8598949211908932


Optimising Text Pipeline Components

In [16]:
class BaseWrapper(BaseEstimator, TransformerMixin):
    """class wrapping a sentence processing function so that it can be used in a sklearn.pipeline.Pipeline"""

    def fit(self, x, y=None): #This method is usually overridden in children classes
        """ This method actually does nothing.
        It will be overridden by the child classes.
        In its children implementations this method will perform all the
        setup activities required before calling either the method transform()
        or the method predict().
        E.g., a machine learning classifier is trained calling the fit() method,
        once trained it can be used to classify new elements by calling the method predict()
        """
        return self

    def manageSentence(self, sentence): #This method is usually overridden in children classes
        """Called by transform(). The sentence is expected to be a either a string or a list of words,
        this method can return either a string or a list of words"""
        return sentence

    def transform(self, listOfSentences):
        """ sentenceList: list of sentences.
        Every sentence can be either a string or a list of words
        Return a list of lists. Each sentence is preprocessed using the manageSentence() method.
        Each child class can override the manageSentence() method to implement a specific preprocessing behavior.
        The list of preprocessed documents is returned."""
        toReturn = []
        for sentence in listOfSentences:
            processedSentence = self.manageSentence(sentence)
            toReturn.append(processedSentence)
        return toReturn

class HTMLAccentsReplacer(BaseWrapper):
    def manageSentence(self, sentence):
        """Replace html representations of special letters with the corresponding unicode character.
        E.g.  &agrave with à.
        Args:
           * s(string): the string where the html codes should be replaced  """
        assert type(sentence)==type('') or type(sentence)==type(u''), "HTMLAccentsReplacer Assertion Error"
        replacemap={'&Ecirc;': 'Ê', '&raquo;': '»', '&eth;': 'ð', '&divide;': '÷', '&atilde;': 'ã', '&Aelig;':
                    'Æ', '&frac34;': '¾', '&nbsp;': ' ', '&Aumbl;': 'Ä', '&Ouml;': 'Ö', '&Egrave;': 'È', '&Icirc;': 'Î',
                    '&deg;': '°', '&ocirc;': 'ô', '&Ugrave;': 'Ù', '&ndash;': '–', '&gt;': '>', '&Thorn;': 'Þ',
                    '&aring;': 'å', '&frac12;': '½', '&frac14;': '¼', '&Aacute;': 'Á', '&szlig;': 'ß', '&trade;': '™',
                    '&igrave;': 'ì', '&aelig;': 'æ', '&times;': '×', '&egrave;': 'è', '&Atilde;': 'Ã', '&Igrave;': 'Ì',
                    '&Eth;': 'Ð', '&ucirc;': 'û', '&lsquo;': '‘', '&agrave;': 'à', '&thorn;': 'þ', '&Ucirc;': 'Û',
                    '&amp;': '&', '&uuml;': 'ü', '&yuml;': '', '&ecirc;': 'ê', '&laquo;': '«', '&infin;': '∞',
                    '&Ograve;': 'Ò', '&oslash;': 'ø', '&yacute;': 'ý', '&plusmn;': '±', '&icirc;': 'î', '&auml;': 'ä',
                    '&ouml;': 'ö', '&Ccedil;': 'Ç', '&euml;': 'ë', '&lt;': '<', '&eacute;': 'é', '&ntilde;': 'ñ',
                    '&pound;': '£', '&Iuml;': 'Ï', '&Eacute;': 'É', '&Ntilde;': 'Ñ', '&rsquo;': '’', '&euro;': '€',
                    '&rdquo;': '”', '&Acirc;': 'Â', '&ccedil;': 'ç', '&Iacute;': 'Í', '&quot;': '"', '&Aring;': 'Å',
                    '&Oslash;': 'Ø', '&Otilde;': 'Õ', '&Uacute;': 'Ú', '&reg;': '®', '&Yacute;': 'Ý', '&iuml;': 'ï',
                    '&ugrave;': 'ù', '&alpha;': 'α', '&copy;': '©', '&ldquo;': '“', '&oacute;': 'ó', '&Euml;': 'Ë',
                    '&uacute;': 'ú', '&ograve;': 'ò', '&acirc;': 'â', '&aacute;': 'á', '&Agrave;': 'À', '&Oacute;': 'Ó',
                    '&Uuml;': 'Ü', '&iacute;': 'í', '&cent;': '¢', '&Ocirc;': 'Ô', '&mdash;': '—', '&otilde;': 'õ',
                    '&beta;': 'β'
        }
        '''
        replacemap={u'&Ecirc;': u'\xca', u'&raquo;': u'\xbb', u'&eth;': u'\xf0', u'&divide;': u'\xf7',
                    u'&atilde;': u'\xe3', u'&Aelig;': u'\xc6', u'&frac34;': u'\xbe', u'&nbsp;': u' ',
                    u'&Aumbl;': u'\xc4', u'&Ouml;': u'\xd6', u'&Egrave;': u'\xc8', u'&Icirc;': u'\xce',
                    u'&deg;': u'\xb0', u'&ocirc;': u'\xf4', u'&Ugrave;': u'\xd9', u'&ndash;': u'\u2013',
                    u'&gt;': u'>', u'&Thorn;': u'\xde', u'&aring;': u'\xe5', u'&frac12;': u'\xbd',
                    u'&frac14;': u'\xbc', u'&Aacute;': u'\xc1', u'&szlig;': u'\xdf', u'&trade;': u'\u2122',
                    u'&igrave;': u'\xec', u'&aelig;': u'\xe6', u'&times;': u'\xd7', u'&egrave;': u'\xe8',
                    u'&Atilde;': u'\xc3', u'&Igrave;': u'\xcc', u'&Eth;': u'\xd0', u'&ucirc;': u'\xfb',
                    u'&lsquo;': u'\u2018', u'&agrave;': u'\xe0', u'&thorn;': u'\xfe', u'&Ucirc;': u'\xdb',
                    u'&amp;': u'&', u'&uuml;': u'\xfc', u'&yuml;': u'', u'&ecirc;': u'\xea', u'&laquo;': u'\xab',
                    u'&infin;': u'\u221e', u'&Ograve;': u'\xd2', u'&oslash;': u'\xf8', u'&yacute;': u'\xfd',
                    u'&plusmn;': u'\xb1', u'&icirc;': u'\xee', u'&auml;': u'\xe4', u'&ouml;': u'\xf6',
                    u'&Ccedil;': u'\xc7', u'&euml;': u'\xeb', u'&lt;': u'<', u'&eacute;': u'\xe9',
                    u'&ntilde;': u'\xf1', u'&pound;': u'\xa3', u'&Iuml;': u'\xcf', u'&Eacute;': u'\xc9',
                    u'&Ntilde;': u'\xd1', u'&rsquo;': u'\u2019', u'&euro;': u'\u20ac', u'&rdquo;': u'\u201d',
                    u'&Acirc;': u'\xc2', u'&ccedil;': u'\xe7', u'&Iacute;': u'\xcd', u'&quot;': u'"',
                    u'&Aring;': u'\xc5', u'&Oslash;': u'\xd8', u'&Otilde;': u'\xd5', u'&Uacute;': u'\xda',
                    u'&reg;': u'\xae', u'&Yacute;': u'\xdd', u'&iuml;': u'\xef', u'&ugrave;': u'\xf9',
                    u'&alpha;': u'\u03b1', u'&copy;': u'\xa9', u'&ldquo;': u'\u201c', u'&oacute;': u'\xf3',
                    u'&Euml;': u'\xcb', u'&uacute;': u'\xfa', u'&ograve;': u'\xf2', u'&acirc;': u'\xe2',
                    u'&aacute;': u'\xe1', u'&Agrave;': u'\xc0', u'&Oacute;': u'\xd3', u'&Uuml;': u'\xdc',
                    u'&iacute;': u'\xed', u'&cent;': u'\xa2', u'&Ocirc;': u'\xd4', u'&mdash;': u'\u2014',
                    u'&otilde;': u'\xf5', u'&beta;': u'\u03b2'}
        '''
        for before in replacemap:
            after=replacemap[before] # getting the string to be replaced
            sentence=sentence.replace(before, after)
        return sentence

'''
# Required in python2, no more necessary in python3
class Str2Unicode(BaseWrapper):
    def manageSentence(self, sentence):
        """Converts raw strings to unicode, to better manage accented letters, money symbols (e.g., pounds)"""
        #print(type(sentence), sentence) # ****** cancellami
        # if the parameter is not the right type, the execution is interrupted
        assert type(sentence)==type('') or type(sentence)==type(u''), "Str2Unicode Assertion Error"
        if type(sentence)==type(u''): # Now it should work also with python3
            return sentence
        elif type(sentence)==type(''):
            return sentence.decode('utf-8', errors='strict')  # interpret all raw strings into unicode
        else:
            return sentence
'''

class Tokenizer(BaseWrapper):
    def manageSentence(self, sentence):
        """This method turn a single document (i.e., a string) into a list of single words (i.e., tokens).
        The parameter "sentence" is expected to be a string, this method returns a list of strings whereas each string
        is a tokenized word. This method replaces all the punctuation with spaces.
        Two or more consecuitve spaces are reduced to a single space.
        Then the strin is splitted in substring using the spaces as split markers"""

        if sentence==None:
            return[]
        # if the parameter is not the right type, the execution is interrupted
        assert type(sentence)==type('') or type(sentence)==type(u''), "Tokenizer Assertion Error"
        punteggiatura=u'!{}[]?"",;.:-<>|/\\*=+-_% \n\t\r()'+u"'" +u'\u2019'+u'\u2018' #\r and \n can be used as "new line"
        # Unicode Character 'RIGHT SINGLE QUOTATION MARK' (U+2019)
        #
        for l in punteggiatura:
           #print(s)
           sentence=sentence.replace(l,u" ") #replacing all punctuation characters with spaces

        # loop untill all double spaces are removed
        while sentence.find(u"  ")!=-1:
            sentence=sentence.replace(u"  ",u" ")  #replacing double spaces with a single one
        return sentence.split(u' ')   #e.g., "a b c d".split(' ')  returns ['a','b','c','d']

class LowerCaseReducer(BaseWrapper):
    def manageSentence(self, sentence):
        """sentence is expected to be a list of words (each item is a string),
        this method returns a list of strings whereas each string is the lower case version of the original word"""
        # preliminary check over the input data type
        assert type(sentence)==type([]), "LowerCaseReducer, Assertion Error"
        # The next line uses a python trick called List Comprehensions.
        return [w.lower() for w in sentence]
        # builds a new list, where each word of the original list is turned into a lower case string


class EnglishStopWordsRemover(BaseWrapper):
    def getStopWords(self):
        """This method returns a list of English stop words. Stop words can be added to the list"""
        return [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves',
                u'you', u'your', u'yours', u'yourself', u'yourselves',
                u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself',
                u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves',
                u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those',
                u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being',
                u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing',
                u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while',
                u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through',
                u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out',
                u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when',
                u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other',
                u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very',
                u's', u't', u'can', u'will', u'just', u'don', u'should', u'now', u'd', u'll', u'm', u'o', u're',
                u've', u'y', u'ain', u'aren', u'couldn', u'didn', u'doesn', u'hadn', u'hasn', u'haven', u'isn',
                u'ma', u'mightn', u'mustn', u'needn', u'shan', u'shouldn', u'wasn', u'weren', u'won', u'wouldn']

    def manageSentence(self, sentence):
        """sentence is expected to be a list of words (a list where each item is a string containing a single word),
        this method returns the input list where the stop words are removed """
        assert type(sentence)==type([]) , "EnglishStopWordsRemover, Assertion Error"
        stopWords = self.getStopWords()
        return [w for w in sentence if w not in stopWords]

class EnglishStemmer(BaseWrapper):
    def __init__(self):
        """Load the NLTK English stemmer. A stemmer is an algorithm that recues a word to its base form
        e.g., "books" is reduced to "book", 'children' is reduced to 'child'. """
        self.st = nltk.stem.SnowballStemmer("english") # loading the NLTK stemmer
    def  manageSentence(self, sentence):
        """sentence is expected to be a list of words (a list where each item is a string containing a single word),
        this method returns a list of stemmed words"""
        assert type(sentence)==type([]), "EnglishStemmer, Assertion Error"
        return [self.st.stem(w) for w in sentence]


class RemoveNumbers(BaseWrapper):
    def manageSentence(self, sentence):
        """Sentence is expected to be a list of words (a list where each item is a string containing a single word),
        this method returns the input list where the numbers are removed. """
        assert type(sentence)==type([]), "RemoveNumbers, Assertion Error"
        return [w for w in sentence if w.isdigit()==False]

class RemoveEmptyWords(BaseWrapper):
    def manageSentence(self, sentence):
        """Sentence is expected to be a list of words (a list where each item is a string containing a single word),
        this method returns the input list where the empty words are removed """
        assert type(sentence)==type([]), "RemoveEmptyWords, Assertion Error"
        return [w for w in sentence if not (w==u'' or w=='')]

class Bag2Text(BaseWrapper):
    def manageSentence(self, sentence):
        """sentence is expected to be a list of words (a list where each item is a string containing a single word),
        this method returns a single string obtained joining the words and separing them using the space"""
        assert type(sentence)==type([]), "Bag2Text, Assertion Error"
        # Next line builds a string by joining with spaces all the elements of sentence
        return u' '.join(sentence)

def unityFunction(x):
  """This function returns the same object received as input.
  For advanced pythonists: equivalent to lambda x:x """
  return x

In [17]:
cp = Pipeline([
   #('Str2Unicode', Str2Unicode()), no more required in Python3
   ('HTMLAccentsReplacer', HTMLAccentsReplacer() ),
   ('Tokenizer', Tokenizer() ),
   ('LowerCaseReducer', LowerCaseReducer() ),
   ('StopWordsRemover', EnglishStopWordsRemover() ),
   ('Stemmer', EnglishStemmer() ),
   ('RemoveNumbers', RemoveNumbers() ),
   ('RemoveEmptyWords', RemoveEmptyWords() ),
   #('Bag2Text', Bag2Text() ),
   ('vectorizer', CountVectorizer()),
   ('classifier', LogisticRegression() ), # LinearSVC
                           ])

# Collecting all the param values in a single data structure.
# The parameter keyword should be composed as follows: pipelineComponentName + '__' + paramName.
clsfParams = {
   'classifier__C': 0.001,
   'vectorizer__preprocessor': unityFunction, # since we provided a customized preprocessing pipeline, we turn off the usual preprocessing pipeline
   'vectorizer__tokenizer': unityFunction, # Same as above.
   'vectorizer__token_pattern': None, # to prevent a warning. This is used in combination with a custom tokenizer
   'vectorizer__ngram_range': (1,1),

}

cp.set_params(**clsfParams)
cp.fit(xTrain, yTrain)

yPred=cp.predict(xTest)

print("="*50)
print('Classification report')
clasRepSt02 = classification_report(yTest,yPred)
print(clasRepSt02)
print('Accuracy')
print(accuracy_score(yTest,yPred))
print("="*50)

Classification report
              precision    recall  f1-score   support

           0       0.79      0.98      0.87       286
           1       0.97      0.74      0.84       285

    accuracy                           0.86       571
   macro avg       0.88      0.86      0.86       571
weighted avg       0.88      0.86      0.86       571

Accuracy
0.8581436077057794


Grid search

In [18]:
cp = Pipeline([
   #('Str2Unicode', Str2Unicode()),
   ('HTMLAccentsReplacer', HTMLAccentsReplacer() ),
   ('Tokenizer', Tokenizer() ),
   ('LowerCaseReducer', LowerCaseReducer() ),
   ('StopWordsRemover', EnglishStopWordsRemover() ),
   ('Stemmer', EnglishStemmer() ),
   ('RemoveNumbers', RemoveNumbers() ),
   ('RemoveEmptyWords', RemoveEmptyWords() ),
   #('Bag2Text', Bag2Text() ),
   ('vectorizer', CountVectorizer()),
   ('classifier', LogisticRegression() ),  # LinearSVC
                           ])

# Setting for each parameter the value space (i.e., the set of values to evaluate)
paramSpace = {
   'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100], # Values for grid search should be enclosed by []
   'classifier__solver': ['liblinear'],
   'classifier__class_weight': [None, 'balanced'], # if the classes were imbalanced, we could try this approach
   'vectorizer__preprocessor': [unityFunction], # since we provided a customized preprocessing pipeline, we turn off the usual preprocessing pipeline
   'vectorizer__tokenizer': [unityFunction], # Same as above.
   'vectorizer__token_pattern': [None], # to prevent warnings. This is used in combination with a custom tokenizer
   'vectorizer__ngram_range': [(1,1), (1,2), (1,3)],   #
   'vectorizer__max_df': [0.7],  # If a term is in more of the 70% of documents, it is too frequent to be discriminative
   'vectorizer__min_df': [2, 4], # Minimum number of documents where the term should appear (otherwise it won't be considered in the Vocabulary)
} # a python list [] is mandatory even if only one element is in

start_time = time.time()
# cv=4 k-fold validation, con k=4
gs = GridSearchCV(cp, param_grid=paramSpace, scoring='accuracy', cv=4)
gs.fit(xTrain,yTrain)

print("--- %s seconds ---" % (time.time() - start_time))
print(gs.best_params_)
print('Scoring result')
print(gs.best_score_)

--- 85.32162809371948 seconds ---
{'classifier__C': 1, 'classifier__class_weight': None, 'classifier__solver': 'liblinear', 'vectorizer__max_df': 0.7, 'vectorizer__min_df': 2, 'vectorizer__ngram_range': (1, 3), 'vectorizer__preprocessor': <function unityFunction at 0x7a43502f5ee0>, 'vectorizer__token_pattern': None, 'vectorizer__tokenizer': <function unityFunction at 0x7a43502f5ee0>}
Scoring result
0.948168801331452


Classification report of best pipeline for gridsearch

In [19]:
clsfParams = {
   'classifier__C': 10,
   'classifier__class_weight': None,
   'classifier__solver': 'liblinear',
   'vectorizer__max_df': 0.7,
   'vectorizer__min_df': 4,
   'vectorizer__ngram_range': (1, 3),
   'vectorizer__preprocessor': unityFunction,
   'vectorizer__token_pattern': None,
   'vectorizer__tokenizer': unityFunction
}
cp.set_params(**clsfParams)
cp.fit(xTrain, yTrain)
yPred=cp.predict(xTest)

print("="*50)
print('Classification report')
print(classification_report(yTest,yPred))
print('Accuracy')
print(accuracy_score(yTest,yPred))
print("="*50)

Classification report
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       286
           1       0.97      0.94      0.96       285

    accuracy                           0.96       571
   macro avg       0.96      0.96      0.96       571
weighted avg       0.96      0.96      0.96       571

Accuracy
0.9597197898423818


In [21]:
# Creating the Document Term Matrix from xTrain and xTest
cv = CountVectorizer(
    ngram_range = (1, 3), # Those values were identified as optimal from a previous notebook
    min_df = 4,
    max_df = 0.7
)
xTrainDtm = cv.fit_transform(xTrain)
xTestDtm = cv.transform(xTest)

Automated Machine Learning Pipeline Optimization with TPOT

Genetic programming is used to obtain the optimal pipeline.

In [26]:
! pip install tpot==0.12.2
from tpot import TPOTClassifier

tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    config_dict='TPOT sparse',
    random_state=42
)

tpot.fit(xTrainDtm, yTrain)

Collecting scikit-learn>=1.4.1 (from tpot==0.12.2)
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.1.3
    Uninstalling scikit-learn-1.1.3:
      Successfully uninstalled scikit-learn-1.1.3
Successfully installed scikit-learn-1.6.1


Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7a428e2d71a0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


is_classifier
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor




is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_classifier
is_regressor
is_classifier


Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9331340369715088

Generation 2 - Current best internal CV score: 0.9368945695130796

Generation 3 - Current best internal CV score: 0.9444054664144957

Generation 4 - Current best internal CV score: 0.9496624163667052

Generation 5 - Current best internal CV score: 0.9496624163667052

Best pipeline: LinearSVC(input_matrix, C=0.5, dual=True, loss=hinge, penalty=l2, tol=1e-05)


In [27]:
# 1. Visualizza la pipeline ottimizzata
print("\n" + "="*50)
print("Pipeline ottimizzata:")
print(tpot.fitted_pipeline_)

# 2. Calcola e visualizza le metriche
yPred = tpot.predict(xTestDtm)  # Usa direttamente il modello addestrato

print("\n" + "="*50)
print("Classification report:")
print(classification_report(yTest, yPred))

print("\nAccuracy:")
print(accuracy_score(yTest, yPred))
print("="*50 + "\n")


Pipeline ottimizzata:
Pipeline(steps=[('linearsvc',
                 LinearSVC(C=0.5, dual=True, loss='hinge', random_state=42,
                           tol=1e-05))])

Classification report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       286
           1       0.98      0.93      0.95       285

    accuracy                           0.96       571
   macro avg       0.96      0.96      0.96       571
weighted avg       0.96      0.96      0.96       571


Accuracy:
0.9562171628721541

