# BRIEF : THE WHEEL OF EMOTIONS

## Ressources
https://www.actuia.com/contribution/victorbigand/tutoriel-tal-pour-les-debutants-classification-de-texte/ <br>
https://realpython.com/sentiment-analysis-python/#how-classification-works <br>
https://www.datacamp.com/community/tutorials/stemming-lemmatization-python <br>
https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

## Importation des données

In [13]:
import pandas as pd
import string
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB
from sklearn.decomposition import FastICA, KernelPCA, TruncatedSVD, SparsePCA, NMF, FactorAnalysis, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler


from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import ngrams

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


import time
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline




[nltk_data] Downloading package stopwords to
[nltk_data]     /home/helloworld/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/helloworld/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/helloworld/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Chargement des données

In [2]:
# Load the emotion dataset from the working directory into a DataFrame.
data = pd.read_csv('Emotion_final.csv')

## Mise à l'échelle pour intégration 

In [3]:
# Values of the 'Text' column as a plain Python list.
x = list(data['Text'])

## NLTK : Tokenisation, stopwords, lemmatization, n-grams

In [4]:
# Every ASCII punctuation character, added to the stop-word list below so
# that punctuation tokens are filtered out together with the stop words.
exclude = set(string.punctuation)

# NLTK's English stop-word list, extended in place with the punctuation.
stopW = stopwords.words('english')
stopW += exclude

In [5]:
# Tokenize each text on its own. Tokenizing ``str(x)`` would tokenize the
# Python repr of the whole list, so the list brackets, the quotes around each
# item and escape sequences would leak into the token stream.
toktok = [token for text in x for token in word_tokenize(str(text))]

# Drop stop words and punctuation. A set gives constant-time membership tests
# instead of scanning the stop-word list once per token.
stop_set = set(stopW)
tokens_without_stopW = [word for word in toktok if word not in stop_set]

In [6]:
# Single WordNet lemmatizer instance, used by the lemmatization code below.
lemma = WordNetLemmatizer()

In [7]:
# Lemmatize every remaining token three times in a row: first as an
# adjective, then as a verb, and finally as a noun.
tokens = []
for word in tokens_without_stopW:
    as_adjective = lemma.lemmatize(word, pos='a')
    as_verb = lemma.lemmatize(as_adjective, pos='v')
    tokens.append(lemma.lemmatize(as_verb, pos='n'))

In [8]:
def lemmatize(sent):
    """Return ``sent`` lower-cased, tokenized and lemmatized as one string.

    Each token is passed through the module-level ``lemma`` lemmatizer three
    times (as adjective, then verb, then noun).

    Args:
        sent: The text to lemmatize.

    Returns:
        The lemmatized tokens of ``sent`` separated by single spaces.
    """
    # Lemmatize the tokens of *this* sentence; iterating over the global
    # ``tokens_without_stopW`` made the result independent of ``sent``.
    words = word_tokenize(sent.lower())
    lemmas = [
        lemma.lemmatize(lemma.lemmatize(lemma.lemmatize(word, pos='a'), pos='v'), pos='n')
        for word in words
    ]
    # Join with a space so token boundaries survive; joining with '' glued
    # every token into one unbroken string.
    return ' '.join(lemmas)

In [9]:
# Collect every pair of consecutive tokens (bigrams) into a list.
bigrams = ngrams(tokens_without_stopW, 2)
x1 = list(bigrams)

## Classification 

In [9]:
# Bag-of-words model over unigrams and bigrams that ignores the custom
# stop-word list (English stop words plus punctuation).
vectorizer = CountVectorizer(stop_words=stopW, analyzer='word', ngram_range=(1,2))

# Learn the vocabulary and build the sparse document-term count matrix.
X = vectorizer.fit_transform(x)

# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_names = vectorizer.get_feature_names_out()
else:
    features_names = vectorizer.get_feature_names()

# NOTE: ``toarray`` builds a dense documents x vocabulary matrix, which can
# need a lot of memory with a bigram vocabulary.
pd.DataFrame(X.toarray(), columns = features_names)

## Pipeline 

In [None]:
# First dataset: skip the first row of the file and name the two columns
# 'content' and 'sentiment', matching the columns read from the second file.
# NOTE(review): assumes the file has a header row and exactly two columns
# (text, label) — confirm.
df = pd.read_csv("Emotion_final.csv", names=['content','sentiment'], skiprows=1)

# Second dataset: keep only the 'sentiment' and 'content' columns.
df1 = pd.read_csv('text_emotion.csv', usecols=['sentiment','content'])

In [None]:
# Texts and their labels as NumPy arrays, so they can be indexed with the
# integer index arrays produced by the train/test splitter.
corpus = np.array(df['content'])
targets = np.array(df['sentiment'])

In [None]:
# English stop-word list used by the vocabulary analysis.
# NOTE(review): this rebinds ``stopwords`` from the module imported with
# ``from nltk.corpus import stopwords`` to a plain list, so a later
# ``stopwords.words(...)`` call fails once this cell has run.
stopwords = nltk.corpus.stopwords.words("english")

In [None]:
# Vocabulary analysis: count every non-stop-word over the whole corpus.
vec = CountVectorizer(stop_words=stopwords)
X = vec.fit_transform(corpus)

# ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vec, 'get_feature_names_out'):
    words = vec.get_feature_names_out()
else:
    words = vec.get_feature_names()

print("vocabulary size; %d" % len(words) )

# Compute rank
# Total count of each word over all documents (column sums of X).
wsum = np.array(X.sum(0))[0]
# Word indices ordered from the most to the least frequent word.
ix = wsum.argsort()[::-1]
wrank = wsum[ix]
labels = [words[i] for i in ix]

# Sub-sample the data to plot:
# keep the first `head` items, then sample the rest with the given step.
def subsample(x, step=150, head=30):
    """Return the first ``head`` items of ``x`` plus a strided sample of the rest.

    The strided part starts right after the head. It previously started at
    index 10, inside the head, so that item was returned twice and out of
    order.

    Args:
        x: Sequence or array to sub-sample.
        step: Stride used to sample the items that follow the head.
        head: Number of leading items that are always kept.

    Returns:
        A 1-D NumPy array with the head items followed by the sampled ones.
    """
    return np.hstack((x[:head], x[head::step]))


# Bar chart of the sub-sampled word counts in rank order, with the matching
# words as tick labels.
freq = subsample(wrank)
r = np.arange(len(freq))

fig, ax = plt.subplots(figsize=(20,8))
ax.bar(r, freq, width=0.7)
ax.set_xticks(r)
ax.set_xticklabels(subsample(labels), rotation=55)
ax.set_xlabel('word rank')
ax.set_ylabel('word frequncy')
# The trailing semicolon keeps the notebook from echoing the return value.
ax.set_title("Words ordered by rank. The first rank is the most frequent words and the last one is the less present");

In [None]:
# Candidate text-classification pipelines. The step names matter: they are
# joined into the model name that identifies each pipeline in the results.

# Raw token counts -> linear model trained with SGD.
pipe1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('sgd', SGDClassifier()),
])

# Token counts -> TF-IDF weighting -> linear model trained with SGD.
pipe2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sgd', SGDClassifier()),
])

# Token counts -> TF-IDF weighting -> linear support vector classifier.
pipe3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svml', LinearSVC()),
])

# Token counts -> TF-IDF weighting -> logistic regression.
pipe4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logit', LogisticRegression()),
])

# Raw token counts -> complement naive Bayes.
pipe5 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('compl_nb', ComplementNB()),
])


def run_pipes(pipes, splits=10, test_size=0.2, seed=42):
    """Fit and score every pipeline on random train/test splits.

    The module-level ``corpus`` and ``targets`` arrays are split ``splits``
    times with ``ShuffleSplit``. Every pipeline is fitted on each training
    part and scored on the matching test part.

    Args:
        pipes: Pipelines to evaluate. The step names of a pipeline, joined
            with "-", are used as its key in the result.
        splits: Number of random train/test splits.
        test_size: Passed to ``ShuffleSplit`` as ``test_size``.
        seed: Passed to ``ShuffleSplit`` as ``random_state``.

    Returns:
        A ``defaultdict`` mapping each model name to a list holding one
        ``[fit_time, f1, precision, recall]`` row per split. The three
        scores are macro-averaged; ``fit_time`` is in seconds.
    """
    res = defaultdict(list)
    spliter = ShuffleSplit(n_splits=splits, test_size=test_size, random_state=seed)
    for idx_train, idx_test in spliter.split(corpus):
        # The split is the same for every pipeline, so slice the data once
        # per split rather than once per pipeline.
        X_train, X_test = corpus[idx_train], corpus[idx_test]
        y_train, y_test = targets[idx_train], targets[idx_test]

        for pipe in pipes:
            # Name of the model, built from its step names.
            name = "-".join(step_name for step_name, _ in pipe.steps)

            # Learn. perf_counter is monotonic, so the measured duration is
            # not affected by system clock adjustments.
            start = time.perf_counter()
            pipe.fit(X_train, y_train)
            fit_time = time.perf_counter() - start

            # Predict and save results.
            y_pred = pipe.predict(X_test)
            res[name].append([
                fit_time,
                f1_score(y_test, y_pred, average='macro'),
                precision_score(y_test, y_pred, average='macro'),
                recall_score(y_test, y_pred, average='macro'),
            ])
    return res

def print_table(res):
    """Summarize per-split results into one row per model.

    Args:
        res: Mapping from model name to a list of
            ``[fit_time, f1, precision, recall]`` rows, one row per split
            (the structure built by ``run_pipes``).

    Returns:
        A DataFrame indexed by model name with the columns ``time`` (mean
        fit time), ``f1_score`` (a ``[mean, std]`` list), ``Precision``
        (mean) and ``Recall`` (mean).
    """
    summary = {}
    for model, rows in res.items():
        arr = np.array(rows)
        # Column layout of ``arr``: 0 = fit time, 1 = f1, 2 = precision,
        # 3 = recall. Every score used to be read from column 2.
        summary[model] = {
            "time": arr[:, 0].mean().round(2),
            "f1_score": [arr[:, 1].mean().round(3), arr[:, 1].std().round(3)],
            "Precision": arr[:, 2].mean().round(3),
            "Recall": arr[:, 3].mean().round(3),
        }
    return pd.DataFrame.from_dict(summary, orient="index").round(3)



In [None]:
# Evaluate the five pipelines on a single random split and show the summary.
res = run_pipes([pipe1, pipe2, pipe3, pipe4, pipe5], splits=1)
print_table(res)

## Concatenation of data

In [None]:
# Stack both datasets row-wise and renumber the index from 0.
# NOTE(review): the two files may not use the same set of sentiment labels —
# confirm before training on the merged data.
final = pd.concat([df, df1], ignore_index=True)
final

In [None]:
# Texts and labels of the merged dataset as NumPy arrays; these replace the
# module-level arrays that the evaluation function reads.
corpus = np.array(final['content'])
targets = np.array(final['sentiment'])

In [None]:
# Fresh, unfitted copies of the candidate pipelines for the merged dataset.
# The step names matter: they are joined into the model name that identifies
# each pipeline in the results.

# Raw token counts -> linear model trained with SGD.
pipe1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('sgd', SGDClassifier()),
])

# Token counts -> TF-IDF weighting -> linear model trained with SGD.
pipe2 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sgd', SGDClassifier()),
])

# Token counts -> TF-IDF weighting -> linear support vector classifier.
pipe3 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svml', LinearSVC()),
])

# Token counts -> TF-IDF weighting -> logistic regression.
pipe4 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logit', LogisticRegression()),
])

# Raw token counts -> complement naive Bayes.
pipe5 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('compl_nb', ComplementNB()),
])


def run_pipes(pipes, splits=10, test_size=0.2, seed=42):
    """Fit and score every pipeline on random train/test splits.

    The module-level ``corpus`` and ``targets`` arrays are split ``splits``
    times with ``ShuffleSplit``. Every pipeline is fitted on each training
    part and scored on the matching test part.

    Args:
        pipes: Pipelines to evaluate. The step names of a pipeline, joined
            with "-", are used as its key in the result.
        splits: Number of random train/test splits.
        test_size: Passed to ``ShuffleSplit`` as ``test_size``.
        seed: Passed to ``ShuffleSplit`` as ``random_state``.

    Returns:
        A ``defaultdict`` mapping each model name to a list holding one
        ``[fit_time, f1, precision, recall]`` row per split. The three
        scores are macro-averaged; ``fit_time`` is in seconds.
    """
    res = defaultdict(list)
    spliter = ShuffleSplit(n_splits=splits, test_size=test_size, random_state=seed)
    for idx_train, idx_test in spliter.split(corpus):
        # The split is the same for every pipeline, so slice the data once
        # per split rather than once per pipeline.
        X_train, X_test = corpus[idx_train], corpus[idx_test]
        y_train, y_test = targets[idx_train], targets[idx_test]

        for pipe in pipes:
            # Name of the model, built from its step names.
            name = "-".join(step_name for step_name, _ in pipe.steps)

            # Learn. perf_counter is monotonic, so the measured duration is
            # not affected by system clock adjustments.
            start = time.perf_counter()
            pipe.fit(X_train, y_train)
            fit_time = time.perf_counter() - start

            # Predict and save results.
            y_pred = pipe.predict(X_test)
            res[name].append([
                fit_time,
                f1_score(y_test, y_pred, average='macro'),
                precision_score(y_test, y_pred, average='macro'),
                recall_score(y_test, y_pred, average='macro'),
            ])
    return res

def print_table(res):
    """Summarize per-split results into one row per model.

    Args:
        res: Mapping from model name to a list of
            ``[fit_time, f1, precision, recall]`` rows, one row per split
            (the structure built by ``run_pipes``).

    Returns:
        A DataFrame indexed by model name with the columns ``time`` (mean
        fit time), ``f1_score`` (a ``[mean, std]`` list), ``Precision``
        (mean) and ``Recall`` (mean).
    """
    summary = {}
    for model, rows in res.items():
        arr = np.array(rows)
        # Column layout of ``arr``: 0 = fit time, 1 = f1, 2 = precision,
        # 3 = recall. Every score used to be read from column 2.
        summary[model] = {
            "time": arr[:, 0].mean().round(2),
            "f1_score": [arr[:, 1].mean().round(3), arr[:, 1].std().round(3)],
            "Precision": arr[:, 2].mean().round(3),
            "Recall": arr[:, 3].mean().round(3),
        }
    return pd.DataFrame.from_dict(summary, orient="index").round(3)

In [None]:
# Evaluate the five pipelines on a single random split of the current
# module-level corpus/targets and show the summary.
res = run_pipes([pipe1, pipe2, pipe3, pipe4, pipe5], splits=1)
print_table(res)

In [None]:
#transformer = LabelEncoder()
#a = transformer.fit_transform(corpus)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(targets, a, test_size=0.33, random_state=42)

#clf = LinearSVC()
#clf.fit(X_train, y_train)
#y_pred = clf.predic(X_test)
#print('F1 score :', f1_score(y_test, y_pred))