## Lab 2
**Author :** ***Ahmed Samady***\
**Supervised by :** ***Pr. Lotfi El Aachak***\
**Course :** ***NLP***

### Imports

In [1]:
import pandas as pd
import numpy as np
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

## Part 1: Language Modeling - Regression

### Loading file into a DataFrame

In [2]:
data = pd.read_csv('data/answers.csv')
data

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0
...,...,...,...,...
2437,12.1,log n,5.0,1.0
2438,12.1,minus 1 divided by 2,1.5,0.0
2439,12.1,2n-1,2.5,0.0
2440,12.1,"it takes at most h steps, where h is the heigh...",5.0,1.0


In [3]:
data['id'].value_counts()

id
11.1    60
12.1    56
3.7     31
3.6     31
3.5     31
        ..
10.4    24
10.3    24
10.2    24
10.1    24
10.6    24
Name: count, Length: 85, dtype: int64

### Choosing only the answers for a specific question

In [4]:
data = data[data['id'] == 11.1]

In [5]:
data['score'].describe()

count    60.000000
mean      4.408333
std       0.865985
min       1.500000
25%       4.000000
50%       5.000000
75%       5.000000
max       5.000000
Name: score, dtype: float64

### Removing punctuations and lowercasing text

In [6]:
data['answer'] = data['answer'].apply(lambda x: \
    x.translate(str.maketrans('', '', string.punctuation)).lower())
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['answer'] = data['answer'].apply(lambda x: \


Unnamed: 0,id,answer,score,correct
1862,11.1,the name of the class file the parameters it m...,3.0,0.0
1863,11.1,access specifiers and functions oftentimes a c...,3.0,0.0
1864,11.1,the elements typically included in a class def...,5.0,1.0
1865,11.1,class is user defined it contains members data...,4.0,1.0
1866,11.1,member functions and data members,5.0,1.0


### Tokenization and removing stopwords

In [7]:
data['tokenized'] = data['answer'].apply(lambda x: word_tokenize(x))
data['tokenized'] = data['tokenized'].apply(lambda x: \
    [word for word in x if word not in stopwords.words('english')])
pattern = re.compile(r'\w{1,2}')
data['tokenized'] = data['tokenized'].apply(lambda x: [word for word in x if len(word) >= 3])
data.drop_duplicates(subset=['tokenized'],inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tokenized'] = data['answer'].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tokenized'] = data['tokenized'].apply(lambda x: \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tokenized'] = data['tokenized'].apply(lambda x: [word for word in x if len

Unnamed: 0,id,answer,score,correct,tokenized
1862,11.1,the name of the class file the parameters it m...,3.0,0.0,"[name, class, file, parameters, must, take, pe..."
1863,11.1,access specifiers and functions oftentimes a c...,3.0,0.0,"[access, specifiers, functions, oftentimes, co..."
1864,11.1,the elements typically included in a class def...,5.0,1.0,"[elements, typically, included, class, definit..."
1865,11.1,class is user defined it contains members data...,4.0,1.0,"[class, user, defined, contains, members, data..."
1866,11.1,member functions and data members,5.0,1.0,"[member, functions, data, members]"


In [8]:
data['tokenized']

1862    [name, class, file, parameters, must, take, pe...
1863    [access, specifiers, functions, oftentimes, co...
1864    [elements, typically, included, class, definit...
1865    [class, user, defined, contains, members, data...
1866                   [member, functions, data, members]
1867    [functions, variables, used, object, defined, ...
1868    [data, members, functions, function, definitio...
1869                    [return, type, input, parameters]
1870                [constructor, data, members, methods]
1871    [function, members, member, attributes, words,...
1872    [elements, included, class, definition, access...
1873                    [variables, function, prototypes]
1874    [public, private, variables, initiations, func...
1875                     [return, value, accepted, value]
1876    [class, definition, typically, includes, class...
1877    [parameters, type, class, name, return, type, ...
1878         [data, members, class, variables, functions]
1879    [const

### Applying stemming

In [9]:
data['stemmed_tokens'] = data['tokenized'].apply(lambda x: \
    [PorterStemmer().stem(word) for word in x])
data['stemmed_tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['stemmed_tokens'] = data['tokenized'].apply(lambda x: \


1862    [name, class, file, paramet, must, take, perfo...
1863    [access, specifi, function, oftentim, construc...
1864    [element, typic, includ, class, definit, funct...
1865    [class, user, defin, contain, member, data, fu...
1866                     [member, function, data, member]
1867       [function, variabl, use, object, defin, class]
1868    [data, member, function, function, definit, va...
1869                       [return, type, input, paramet]
1870                  [constructor, data, member, method]
1871    [function, member, member, attribut, word, cla...
1872    [element, includ, class, definit, accessmodifi...
1873                        [variabl, function, prototyp]
1874    [public, privat, variabl, initi, function, inc...
1875                         [return, valu, accept, valu]
1876    [class, definit, typic, includ, class, name, c...
1877    [paramet, type, class, name, return, type, cod...
1878             [data, member, class, variabl, function]
1879       [co

### Applying lemmatization

In [10]:
data['lemmatized_tokens'] = data['tokenized'].apply(lambda x: \
    [WordNetLemmatizer().lemmatize(word) for word in x])
data['lemmatized_tokens']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lemmatized_tokens'] = data['tokenized'].apply(lambda x: \


1862    [name, class, file, parameter, must, take, per...
1863    [access, specifier, function, oftentimes, cons...
1864    [element, typically, included, class, definiti...
1865    [class, user, defined, contains, member, data,...
1866                     [member, function, data, member]
1867    [function, variable, used, object, defined, cl...
1868    [data, member, function, function, definition,...
1869                     [return, type, input, parameter]
1870                  [constructor, data, member, method]
1871    [function, member, member, attribute, word, cl...
1872    [element, included, class, definition, accessm...
1873                      [variable, function, prototype]
1874    [public, private, variable, initiation, functi...
1875                     [return, value, accepted, value]
1876    [class, definition, typically, includes, class...
1877    [parameter, type, class, name, return, type, c...
1878            [data, member, class, variable, function]
1879     [cons

### Generating word embeddings using BagOfWords

In [11]:
data['lemmatized_text'] = data['lemmatized_tokens'].apply(lambda x: ' '.join(x))
vectorizer = CountVectorizer()
bof_model = vectorizer.fit(data['lemmatized_text'].tolist())
bof_vect = bof_model.transform(data['lemmatized_text'])
bow_df = pd.DataFrame(bof_vect.toarray(), columns=vectorizer.get_feature_names_out())
bow_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['lemmatized_text'] = data['lemmatized_tokens'].apply(lambda x: ' '.join(x))


Unnamed: 0,accepted,access,accessmodifier,according,address,algorithm,also,answer,array,arrayor,...,used,user,usually,value,variable,version,way,whole,word,work
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Generating word embeddings using TF-IDF

In [12]:
tfidf = TfidfVectorizer()
tfidf_model = tfidf.fit(data['lemmatized_text'])
tfidf_vect = tfidf_model.transform(data['lemmatized_text'])
tfidf_df = pd.DataFrame(tfidf_vect.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,accepted,access,accessmodifier,according,address,algorithm,also,answer,array,arrayor,...,used,user,usually,value,variable,version,way,whole,word,work
0,0.0,0.0,0.0,0.0,0.0,0.0,0.306395,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.519186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.21989,0.0,0.521989,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.29481,0.349919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Generating word embeddings using Word2Vec 

In [13]:
w2v_model = Word2Vec(data['lemmatized_tokens'], vector_size=30, window=5, min_count=1, workers=4)


In [14]:
def vectorize_answer(answer_tokens, word2vec_model):
    answer_vector = np.mean([word2vec_model.wv.get_vector(word) for word in answer_tokens if word in word2vec_model.wv] \
        or [np.zeros(word2vec_model.vector_size)], axis=0)
    return answer_vector

data['w2v_embeddings'] = data['lemmatized_tokens'].apply(lambda x: vectorize_answer(x, w2v_model))
w2v_df = pd.DataFrame(data['w2v_embeddings'].values.tolist(), index=data.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['w2v_embeddings'] = data['lemmatized_tokens'].apply(lambda x: vectorize_answer(x, w2v_model))


In [15]:
X_train, X_test, y_train, y_test = train_test_split(w2v_df, data['score'], test_size=0.1, random_state=0)

In [16]:
svr = SVR(kernel='rbf', C=10000, gamma=0.1, tol=0.1)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print('SVR MSE Score:', mean_squared_error(y_test, y_pred))
print('SVR RMSE Score:', root_mean_squared_error(y_test, y_pred))

SVR MSE Score: 0.3627122499809037
SVR RMSE Score: 0.6022559671608939


In [17]:
lr = LinearRegression(fit_intercept=True)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print('Linear Regression MSE Score:', mean_squared_error(y_test, y_pred))
print('Linear Regression RMSE Score:', root_mean_squared_error(y_test, y_pred))

Linear Regression MSE Score: 0.5040098460750949
Linear Regression RMSE Score: 0.7099365084816353


In [18]:
dtr = DecisionTreeRegressor(max_depth=5,random_state=0)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
print('Decision Tree Regressor MSE Score:', mean_squared_error(y_test, y_pred))
print('Decision Tree Regressor RMSE Score:', root_mean_squared_error(y_test, y_pred))

Decision Tree Regressor MSE Score: 0.9120370370370369
Decision Tree Regressor RMSE Score: 0.955006302092838


## Part 2: Language Modeling - Classification

### Loading training and validation datasets

In [19]:
tweets_train = pd.read_csv('data/twitter_training.csv')
tweets_val = pd.read_csv('data/twitter_validation.csv')
tweets_val.drop(['id','game'],axis=1,inplace=True)
tweets_train.drop(['id','game'],axis=1,inplace=True)

### cleaning tweets

In [20]:
# helper function to clean tweets
def processTweet(tweet):
    if isinstance(tweet, float):
        return str(tweet)
    # remove user handles tagged in the tweet
    tweet = re.sub('@[^\s]+','',tweet)
    # remove words that start with th dollar sign    
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    tweet = re.sub(r'(?:^|[\s,])([\w-]+\.[a-z]{2,}\S*)\b','',tweet)
    # remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # remove all kinds of punctuations and special characters
    punkt = string.punctuation + r'''`‘’)(+÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”،.”…“–ـ”.°ा'''
    tweet = tweet.translate(str.maketrans('', '', punkt))
    # remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # remove stopwords
    tweet = re.sub(r'\b('+ '|'.join(stopword for stopword in stopwords.words('english'))+ r')\b', '', tweet)
    # remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    tweet = tweet.rstrip(' ')
    # remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    tweet = ''.join(c for c in tweet if c <= '\uffff')
    tweet = re.sub(r'([^\u1F600-\u1F6FF\s])','', tweet)
    # lowercase
    tweet = tweet.lower()
    # remove extra spaces
    tweet = re.sub(r'[\s]{2, }', ' ', tweet)
    
    return tweet

### Label encoding and dropping duplicates

In [21]:
tweets_train['clean_text'] = tweets_train['text'].apply(processTweet)
tweets_train['label'] = LabelEncoder().fit_transform(tweets_train['label'])
tweets_train.drop_duplicates(subset=['clean_text'],inplace=True)
tweets_train.drop(['text'],axis=1,inplace=True)
tweets_train.rename(columns={"clean_text": "text"},inplace=True)
tweets_train.dropna(inplace=True)
tweets_train.drop(tweets_train[tweets_train['text'] == ''].index, inplace = True)
tweets_train.drop(tweets_train[tweets_train['text'] == ' '].index, inplace = True)
tweets_train.drop(tweets_train[tweets_train['text'] == 'nan'].index, inplace = True)
tweets_train.head()

Unnamed: 0,label,text
0,3,getting borderlands murder
1,3,coming borders kill
2,3,getting borderlands kill
3,3,coming borderlands murder
5,3,getting borderlands murder


In [22]:
tweets_val['clean_text'] = tweets_val['text'].apply(processTweet)
tweets_val['label'] = LabelEncoder().fit_transform(tweets_val['label'])
tweets_val.drop_duplicates(subset=['clean_text'],inplace=True)
tweets_val.head()
tweets_val.drop(['text'],axis=1,inplace=True)
tweets_val.rename(columns={"clean_text": "text"},inplace=True)
tweets_val.dropna(inplace=True)
tweets_val.drop(tweets_val[tweets_val['text'] == ''].index, inplace = True)
tweets_val.drop(tweets_val[tweets_val['text'] == ' '].index, inplace = True)
tweets_val.drop(tweets_val[tweets_val['text'] == 'nan'].index, inplace = True)
tweets_val.head()

Unnamed: 0,label,text
0,0,mentioned facebook struggling motivation r...
1,2,bbc news amazon boss jeff bezos rejects claims...
2,1,why pay word functions poorly chromebook
3,1,csgo matchmaking full closet hacking truly aw...
4,2,now president slapping americans face reall...


In [23]:
tweets_train['tokenized'] = tweets_train['text'].apply(lambda x: word_tokenize(x))
tweets_val['tokenized'] = tweets_val['text'].apply(lambda x: word_tokenize(x))

In [24]:
tweets_train['stemmed_tokens'] = tweets_train['tokenized'].apply(lambda x: \
    [PorterStemmer().stem(word) for word in x])
tweets_val['stemmed_tokens'] = tweets_val['tokenized'].apply(lambda x: \
    [PorterStemmer().stem(word) for word in x])

In [25]:
tweets_train['lemmatized_tokens'] = tweets_train['tokenized'].apply(lambda x: \
    [WordNetLemmatizer().lemmatize(word) for word in x])
tweets_val['lemmatized_tokens'] = tweets_val['tokenized'].apply(lambda x: \
    [WordNetLemmatizer().lemmatize(word) for word in x])

In [26]:
tweets_train['lemmatized_text'] = tweets_train['lemmatized_tokens'].apply(lambda x: ' '.join(x))
vectorizer = CountVectorizer()
bof_model = vectorizer.fit(tweets_train['lemmatized_text'].tolist())
bof_vect = bof_model.transform(tweets_train['lemmatized_text'])
bow_df = pd.DataFrame(bof_vect.toarray(), columns=vectorizer.get_feature_names_out())
bow_df.head()

Unnamed: 0,000,00011,00014,00015,00015cant,00016,00054,00105,00107,0023,...,اللعبه,حبيت,خلاص,خلاصunk,عبر,فيديو,٩ᴗ۶,घरच,การออกอากาศของฉ,นจาก
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
tfidf = TfidfVectorizer()
tfidf_model = tfidf.fit(tweets_train['lemmatized_text'])
tfidf_vect = tfidf_model.transform(tweets_train['lemmatized_text'])
tfidf_df = pd.DataFrame(tfidf_vect.toarray(), columns=tfidf.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,000,00011,00014,00015,00015cant,00016,00054,00105,00107,0023,...,اللعبه,حبيت,خلاص,خلاصunk,عبر,فيديو,٩ᴗ۶,घरच,การออกอากาศของฉ,นจาก
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
w2v_model = Word2Vec(tweets_train['lemmatized_tokens'], vector_size=100, window=50, min_count=1, workers=4, epochs=100)

In [29]:
tweets_train['w2v_embeddings'] = tweets_train['lemmatized_tokens'].apply(lambda x: vectorize_answer(x, w2v_model))
w2v_df_train = pd.DataFrame(tweets_train['w2v_embeddings'].values.tolist(), index=tweets_train.index)

In [30]:
tweets_val['w2v_embeddings'] = tweets_val['lemmatized_tokens'].apply(lambda x: vectorize_answer(x, w2v_model))
w2v_df_val = pd.DataFrame(tweets_val['w2v_embeddings'].values.tolist(), index=tweets_val.index)

In [31]:
svc = SVC(kernel='rbf')
svc.fit(w2v_df_train, tweets_train['label'])
y_pred = svc.predict(w2v_df_val)

In [32]:
print(classification_report(tweets_val['label'], y_pred))

              precision    recall  f1-score   support

           0       0.87      0.84      0.86       172
           1       0.83      0.94      0.88       264
           2       0.93      0.77      0.84       276
           3       0.86      0.91      0.88       275

    accuracy                           0.87       987
   macro avg       0.87      0.86      0.86       987
weighted avg       0.87      0.87      0.87       987



In [33]:
print(f"accuracy: {accuracy_score(tweets_val['label'], y_pred)}")

accuracy: 0.8662613981762918


In [52]:
logreg = LogisticRegression(penalty='l2',solver='newton-cholesky',multi_class='ovr',tol=1e-6)
logreg.fit(w2v_df_train, tweets_train['label'])
y_pred = logreg.predict(w2v_df_val)



In [53]:
print("Logistic Regression: ")
print(classification_report(tweets_val['label'], y_pred))

Logistic Regression: 
              precision    recall  f1-score   support

           0       0.39      0.16      0.23       172
           1       0.51      0.73      0.60       264
           2       0.52      0.39      0.45       276
           3       0.54      0.66      0.59       275

    accuracy                           0.51       987
   macro avg       0.49      0.48      0.47       987
weighted avg       0.50      0.51      0.49       987



In [54]:
print(f"Logistic Regression accuracy: {accuracy_score(tweets_val['label'], y_pred)}")

Logistic Regression accuracy: 0.5146909827760892


In [55]:
nbm = GaussianNB()
nbm.fit(w2v_df_train, tweets_train['label'])
y_pred = nbm.predict(w2v_df_val)

In [56]:
print("Naive Bayes: ")
print(classification_report(tweets_val['label'], y_pred))

Naive Bayes: 
              precision    recall  f1-score   support

           0       0.32      0.31      0.31       172
           1       0.52      0.61      0.56       264
           2       0.46      0.38      0.42       276
           3       0.51      0.53      0.52       275

    accuracy                           0.47       987
   macro avg       0.45      0.46      0.45       987
weighted avg       0.47      0.47      0.47       987



In [57]:
print(f"Naive Bayes accuracy: {accuracy_score(tweets_val['label'], y_pred)}")

Naive Bayes accuracy: 0.47213779128672745


In [58]:
adab = AdaBoostClassifier()
adab.fit(w2v_df_train, tweets_train['label'])
y_pred = adab.predict(w2v_df_val)



In [59]:
print("AdaBoost: ")
print(classification_report(tweets_val['label'], y_pred))

AdaBoost: 
              precision    recall  f1-score   support

           0       0.31      0.20      0.25       172
           1       0.49      0.68      0.57       264
           2       0.49      0.33      0.39       276
           3       0.50      0.59      0.54       275

    accuracy                           0.48       987
   macro avg       0.45      0.45      0.44       987
weighted avg       0.46      0.48      0.46       987



In [60]:
print(f"AdaBoost accuracy: {accuracy_score(tweets_val['label'], y_pred)}")

AdaBoost accuracy: 0.475177304964539
