# Task3_Ossan_Noémie_Yosr

## Lecture du fichier CSV

In [1]:
import pandas as pd
import tensorflow
import numpy as np

# Load the tab-separated training corpus: the file has no header row and its
# first 10 lines are skipped.
df = pd.read_csv(
    'task3-train.csv',
    sep='\t',
    header=None,
    skiprows=10,
)

# Keep only the two right-most columns and give them explicit names.
df1 = df.iloc[:, -2:]
df1.columns = ["text", "sentiment"]


## Séparation du corpus (70/30)

In [2]:
# Hold-out split: 70% of the rows for training, 30% for evaluation.
from sklearn.model_selection import train_test_split

data_train, data_test, sentiment_train, sentiment_test = train_test_split(
    df1.text,
    df1.sentiment,
    test_size=0.3,
    random_state=109,  # fixed seed so that re-running yields the same split
)


## Vectorization

In [3]:
from sklearn.feature_extraction.text import CountVectorizer


# Bag-of-words features: the vocabulary is learned from the training texts only,
# then both splits are encoded with that same vocabulary.
vectorizer = CountVectorizer()
vectorizer.fit(data_train)
train_vectors = vectorizer.transform(data_train)
test_vectors = vectorizer.transform(data_test)
# TODO: the test ids and the predictions still have to be retrieved

## Création du modèle à partir des données d'apprentissage

In [4]:
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import classification_report

# Linear-kernel SVM fitted on the training vectors (fit() returns the estimator).
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, sentiment_train)

# Predict the labels of the held-out vectors.
prediction_linear = classifier_linear.predict(test_vectors)

# Precision / recall / F1 per class plus aggregates, as a nested dict.
report = classification_report(sentiment_test, prediction_linear, output_dict=True)

# Aggregate figures first.
for caption, key in (
    ('Accuracy: ', 'accuracy'),
    ('Macro average: ', 'macro avg'),
):
    print(caption, report[key])

print('<==\n FOR EACH CLASS ==>\n')

# Then one line per sentiment class.
for caption, key in (
    ('positive: ', 'positive'),
    ('negative: ', 'negative'),
    ('mixed: ', 'mixed'),
    ('objective: ', 'objective'),
):
    print(caption, report[key])




Accuracy:  0.6196721311475409
Macro average:  {'precision': 0.5411369037245206, 'recall': 0.5223835187433795, 'f1-score': 0.5303524234436784, 'support': 1525}
<==
 FOR EACH CLASS ==>

positive:  {'precision': 0.5700934579439252, 'recall': 0.4728682170542636, 'f1-score': 0.516949152542373, 'support': 129}
negative:  {'precision': 0.6597222222222222, 'recall': 0.6785714285714286, 'f1-score': 0.6690140845070424, 'support': 700}
mixed:  {'precision': 0.24242424242424243, 'recall': 0.21739130434782608, 'f1-score': 0.22922636103151864, 'support': 184}
objective:  {'precision': 0.6923076923076923, 'recall': 0.720703125, 'f1-score': 0.7062200956937799, 'support': 512}


## Création du modèle à partir du corpus complet

In [5]:
# Refit the vectorizer and the classifier on the whole annotated corpus.
train_vectors = vectorizer.fit_transform(df1.text)
# Encode the texts to score with the vocabulary fitted just above. The
# vectorizer must not be refitted on data that is being evaluated, so this is
# transform(), not a second fit_transform().
test_vector = vectorizer.transform(df1.text)
classifier_linear.fit(train_vectors, df1.sentiment)


# Prediction
# NOTE: the texts scored here are exactly the texts the model was trained on,
# so the figures below are training-set scores. They do not estimate how the
# model performs on unseen tweets (compare with the held-out scores instead).
prediction_linear = classifier_linear.predict(test_vector)

report = classification_report(df1.sentiment, prediction_linear, output_dict=True)

print('Accuracy: ', report['accuracy'])
print('Macro average: ', report['macro avg'])

print('\n <==FOR EACH CLASS ==>\n')

print('positive: ', report['positive'])
print('negative: ', report['negative'])
print('mixed: ', report['mixed'])
print('objective: ', report['objective'])


Accuracy:  0.9994095650462508
Macro average:  {'precision': 0.9993084835979785, 'recall': 0.9990582876752612, 'f1-score': 0.9991832062726396, 'support': 5081}

 <==FOR EACH CLASS ==>

positive:  {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 498}
negative:  {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2251}
mixed:  {'precision': 0.9984076433121019, 'recall': 0.9968203497615262, 'f1-score': 0.9976133651551312, 'support': 629}
objective:  {'precision': 0.9988262910798122, 'recall': 0.9994128009395185, 'f1-score': 0.9991194599354271, 'support': 1703}


## Run2 with TFIDF / SVM


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: terms found in fewer than 5 documents or in more than 80% of
# the documents are dropped, and term frequencies are scaled sub-linearly.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)
train_vectors = vectorizer.fit_transform(data_train)
test_vectors = vectorizer.transform(data_test)  # encode only, no refit
# TODO: the test ids and the predictions still have to be retrieved

# Linear-kernel SVM fitted on the training vectors (fit() returns the estimator).
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, sentiment_train)

# Predict the labels of the held-out vectors.
prediction_linear = classifier_linear.predict(test_vectors)

report = classification_report(sentiment_test, prediction_linear, output_dict=True)

# Aggregate figures first.
for caption, key in (
    ('Accuracy: ', 'accuracy'),
    ('Macro average: ', 'macro avg'),
):
    print(caption, report[key])

print('<==\n FOR EACH CLASS ==>\n')

# Then one line per sentiment class.
for caption, key in (
    ('positive: ', 'positive'),
    ('negative: ', 'negative'),
    ('mixed: ', 'mixed'),
    ('objective: ', 'objective'),
):
    print(caption, report[key])

Accuracy:  0.6577049180327869
Macro average:  {'precision': 0.5732139470271697, 'recall': 0.49586950409113345, 'f1-score': 0.5033350769499593, 'support': 1525}
<==
 FOR EACH CLASS ==>

positive:  {'precision': 0.6363636363636364, 'recall': 0.3798449612403101, 'f1-score': 0.4757281553398058, 'support': 129}
negative:  {'precision': 0.64472190692395, 'recall': 0.8114285714285714, 'f1-score': 0.7185325743200506, 'support': 700}
mixed:  {'precision': 0.3055555555555556, 'recall': 0.059782608695652176, 'f1-score': 0.1, 'support': 184}
objective:  {'precision': 0.7062146892655368, 'recall': 0.732421875, 'f1-score': 0.7190795781399808, 'support': 512}


## Run3 Stop words (French dictionary) / with TFIDF / SVM

In [7]:
from nltk.corpus import stopwords
# French stop-word list shipped with NLTK, printed for inspection.
stop_words_list = stopwords.words('french')
print(stop_words_list)

# Same TF-IDF settings as the previous run, with the stop words filtered out.
vectorizer = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
    stop_words=stop_words_list,
)
train_vectors = vectorizer.fit_transform(data_train)
test_vectors = vectorizer.transform(data_test)  # encode only, no refit
# TODO: the test ids and the predictions still have to be retrieved

# Linear-kernel SVM fitted on the training vectors (fit() returns the estimator).
classifier_linear = svm.SVC(kernel='linear').fit(train_vectors, sentiment_train)

# Predict the labels of the held-out vectors.
prediction_linear = classifier_linear.predict(test_vectors)

report = classification_report(sentiment_test, prediction_linear, output_dict=True)

# Aggregate figures first.
for caption, key in (
    ('Accuracy: ', 'accuracy'),
    ('Macro average: ', 'macro avg'),
):
    print(caption, report[key])

print('<==\n FOR EACH CLASS ==>\n')

# Then one line per sentiment class.
for caption, key in (
    ('positive: ', 'positive'),
    ('negative: ', 'negative'),
    ('mixed: ', 'mixed'),
    ('objective: ', 'objective'),
):
    print(caption, report[key])

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

### Tests avec des tweets non annotés

In [8]:
# Create a prediction file for an unlabelled tweet CSV
def predict(inputFile, outputFile, model, skiprows=9, fitted_vectorizer=None):
    """Predict a label for every tweet of a TSV file and write the results.

    The input is read as tab-separated text without a header row. The first
    column is used as the tweet id and the last column as the tweet text.
    One line per tweet is written to ``outputFile`` in the form
    ``<prediction> (<id>)``.

    Parameters
    ----------
    inputFile : path of the tab-separated file to read.
    outputFile : path of the text file to write.
    model : fitted estimator exposing ``predict(vectors)``.
    skiprows : number of leading lines of ``inputFile`` to skip (default 9).
    fitted_vectorizer : object exposing ``transform(texts)``. It must be the
        one the model was trained with. When None, the notebook-level
        ``vectorizer`` variable is used.

    Returns
    -------
    None. The result is the file written to ``outputFile``.
    """
    # Resolve the vectorizer lazily so that the notebook-level name is only
    # needed when the caller does not provide one.
    active_vectorizer = vectorizer if fitted_vectorizer is None else fitted_vectorizer

    dfTest = pd.read_csv(inputFile, sep='\t', header=None, skiprows=skiprows)
    tweets = dfTest.iloc[:, -1]  # last column as a Series => tweet text

    # Ids (first column) as a NumPy array of strings
    tweet_ids = dfTest.iloc[:, 0].to_numpy().astype(str)

    # Encode the tweets and predict their labels
    prediction = model.predict(active_vectorizer.transform(tweets))

    # Wrap each id in parentheses. np.char.add allocates a result wide enough
    # for the longer strings; assigning back into the fixed-width `tweet_ids`
    # array would silently truncate them.
    wrapped_ids = np.char.add(np.char.add("(", tweet_ids), ")")

    matrix = np.column_stack((prediction, wrapped_ids))

    # Write one "<prediction> (<id>)" line per tweet
    np.savetxt(outputFile, matrix, fmt='%s')


In [9]:
# MDP => adjust the CSV file names and the classifier below as needed
prediction_jobs = [
    ('task1-test_extrait20.csv', 'resultat1.sc'),
    ('task2-test_extrait20.csv', 'resultat2.sc'),
    ('task3-test_extrait20.csv', 'resultat3.sc'),
]
# One prediction file per extract, all produced with the current classifier.
for source_csv, result_file in prediction_jobs:
    predict(source_csv, result_file, classifier_linear)

