# NLP homework (Naive Bayes, Logistic Regression, Fasttext)

## loading data 

In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# download necessary stuff
import os
import spacy
try:
    spacy_en = spacy.load("en_core_web_sm")
except:
    os.system('python -m spacy download en_core_web_sm')
    spacy_en = spacy.load("en_core_web_sm")
    
import nltk
for package_name in ['wordnet', 'omw-1.4', 'stopwords', 'punkt']:
    nltk.download(package_name)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Loading "Emotion dataset"

In [4]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset("emotion")
class_mapping = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger', 
    4: 'fear', 
    5: 'surprise'
}

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]



Downloading and preparing dataset emotion/split to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
train_df = pd.DataFrame({'POST': dataset['train']['text'], 'LABEL': dataset['train']['label']})
val_df = pd.DataFrame({'POST': dataset['validation']['text'], 'LABEL': dataset['validation']['label']})
test_df = pd.DataFrame({'POST': dataset['test']['text'], 'LABEL': dataset['test']['label']})

In [6]:
train_df


Unnamed: 0,POST,LABEL
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3



#Preprocessing data

In [9]:
def lemmatize_spacy(text):
    text = spacy_en(text)
    lemmas = [token.lemma_ for token in text]
    return " ".join(lemmas)
def remove_stopwords(text, stopwords):
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text

In [21]:
stops_spacy = sorted(spacy.lang.en.stop_words.STOP_WORDS)
stops_spacy.remove("not")

In [12]:
remove_stopwords(lemmatize_spacy(train_df["POST"][0]),stops_spacy )


'I feel humiliate'

In [13]:
train_df['POST'] = train_df['POST'].map(lemmatize_spacy) 
train_df['POST'] = train_df['POST'].map(lambda x: remove_stopwords(x, stops_spacy)) 


In [26]:
train_df['POST']

0                                     I not feel humiliate
1                  I feel hopeless damn hopeful care awake
2                 I m grab minute post I feel greedy wrong
3               I feel nostalgic fireplace I know property
4                                           I feel grouchy
                               ...                        
15995       I brief time beanbag I anna I feel like I beat
15996    I turn I feel pathetic I wait table sub teachi...
15997                           I feel strong good overall
15998                  I feel like rude comment I m glad t
15999               I know lot I feel stupid I not portray
Name: POST, Length: 16000, dtype: object

In [None]:
stops_spacy = sorted(spacy.lang.en.stop_words.STOP_WORDS)
from nltk.corpus import stopwords
stops = sorted(stopwords.words("english"))


# Bayes 

# Train

In [27]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [29]:
nb = Pipeline([('countVec', CountVectorizer(lowercase=False, token_pattern='\w+', min_df=3)),
               ('clf', MultinomialNB()),])
nb.fit(train_df["POST"], train_df["LABEL"])


Pipeline(steps=[('countVec',
                 CountVectorizer(lowercase=False, min_df=3,
                                 token_pattern='\\w+')),
                ('clf', MultinomialNB())])

In [30]:
y_pred = nb.predict(val_df["POST"])
train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.826


In [31]:
nb = Pipeline([('countVec', CountVectorizer(lowercase=False, token_pattern='\w+', ngram_range=(1, 3), min_df=3)),
               ('clf', MultinomialNB()),])
nb.fit(train_df["POST"], train_df["LABEL"])
y_pred = nb.predict(val_df["POST"])
train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.829


In [32]:
nb = Pipeline([('tfidf', TfidfVectorizer(lowercase=False, token_pattern='\w+', ngram_range=(1, 2), 
                                         min_df=3)),
               ('clf', MultinomialNB()),])
nb.fit(train_df["POST"], train_df["LABEL"])
y_pred = nb.predict(val_df["POST"])
train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.749


# logistic

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
logreg = Pipeline([('countVec', CountVectorizer(lowercase=False, token_pattern='\w+', min_df=3)),
                   ('clf', LogisticRegression(random_state=42, solver='liblinear')),])
logreg.fit(train_df["POST"], train_df["LABEL"])
y_pred = logreg.predict(val_df["POST"])
train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.851


In [35]:
logreg = Pipeline([('countVec', CountVectorizer(lowercase=False, token_pattern='\w+', ngram_range=(1, 2), 
                                                min_df=3)),
                   ('clf', LogisticRegression(random_state=42, solver='liblinear')),])

logreg.fit(train_df["POST"], train_df["LABEL"])
y_pred = logreg.predict(val_df["POST"])
train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.853


In [36]:
logreg = Pipeline([('tfidf', TfidfVectorizer(lowercase=False, token_pattern='\w+', ngram_range=(1, 2), 
                                             min_df=3)),
                   ('clf', LogisticRegression(random_state=42, solver='liblinear')),])

logreg.fit(train_df["POST"], train_df["LABEL"])
y_pred = logreg.predict(val_df["POST"])

train_score =  round(accuracy_score(y_pred, val_df["LABEL"]), 3)
print(train_score)

0.84


# Fasttext

In [40]:
from fasttext import train_supervised


In [43]:
from fasttext import train_supervised

def to_fasttext_format(data: list, labels: list, save_path: str=None):
    ft_data = []
    for d, l in zip(data, labels):
        ft_data.append("__label__{} {}".format(l, d))
    if save_path:
        np.savetxt(save_path, ft_data, fmt='%s')
    else:
        return ft_data
    
def train_fasttext(X_train, y_train, wordNgrams=1, minCount=1, ft_train_path="./untitled.txt", **kwargs):
    to_fasttext_format(X_train, y_train, save_path=ft_train_path)
    ft_model = train_supervised(ft_train_path, wordNgrams=wordNgrams, minCount=minCount, epoch=10, loss="softmax",  **kwargs)
    print(ft_model)
    train_preds = [i[0].split('_')[-1] for i in ft_model.predict(list(X_train))[0]]
    result = [int(item) for item in train_preds]
    train_score = accuracy_score(result, y_train)
    print("train_score = ",train_score)
    
    return ft_model, train_score

In [44]:
ft_model, train_score = train_fasttext(train_df["POST"], train_df["LABEL"])
train_score


<fasttext.FastText._FastText object at 0x7f78f114fdc0>
train_score =  0.9734375


0.9734375

In [46]:
preds = [i[0].split('_')[-1] for i in ft_model.predict(list(val_df["POST"]))[0]]
val_preds = [int(item) for item in preds]
val_score =accuracy_score(val_preds, val_df["LABEL"])
print(f'val accuracy {val_score}')

val accuracy 0.8615


In [48]:
ft_model, train_score = train_fasttext(train_df["POST"], train_df["LABEL"], wordNgrams=2)
preds = [i[0].split('_')[-1] for i in ft_model.predict(list(val_df["POST"]))[0]]
val_preds = [int(item) for item in preds]

val_score = accuracy_score(val_preds,val_df["LABEL"])

print(f'val accuracy {val_score}')

<fasttext.FastText._FastText object at 0x7f78f114f430>
train_score =  0.991625
val accuracy 0.8385


In [50]:
ft_model, train_score = train_fasttext(train_df["POST"], train_df["LABEL"], minn=4, maxn=5)

preds = [i[0].split('_')[-1] for i in ft_model.predict(list(val_df["POST"]))[0]]
val_preds = [int(item) for item in preds]

val_score = accuracy_score(val_preds,val_df["LABEL"])

print(f'val accuracy {val_score}')

<fasttext.FastText._FastText object at 0x7f78f11b3070>
train_score =  0.916375
val accuracy 0.8415
