In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

import functions as func

In [2]:
# Resolve the dataset locations from the project's YAML configuration.
db_locations = func.import_yaml()

# Read the cleaned train and test splits into DataFrames.
df, df_test = (
    pd.read_csv(db_locations['data_clean'][split])
    for split in ('train', 'test')
)


In [3]:
# Preview the first five training rows.
df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [5]:
# Tokenise every phrase and remove stop words via func.convert_token.
# The conversion is slow, so it is skipped when both cached outputs already
# exist on disk; in that case the cached frames are loaded instead.
# Delete the cached files to force a recomputation (for example after
# changing func.convert_token).
spacy_train_path = Path(db_locations['data_clean']['spacy_train'])
spacy_test_path = Path(db_locations['data_clean']['spacy_test'])

if spacy_train_path.exists() and spacy_test_path.exists():
    df = pd.read_csv(spacy_train_path)
    df_test = pd.read_csv(spacy_test_path)
else:
    df['cleaned_text'] = df['Phrase'].apply(func.convert_token)
    df_test['cleaned_text'] = df_test['Phrase'].apply(func.convert_token)

In [6]:
# Persist the tokenised frames: the conversion above is slow, so the result
# is written to disk (without the index) for later reuse.
df.to_csv(path_or_buf=db_locations["data_clean"]["spacy_train"], index=False)
df_test.to_csv(path_or_buf=db_locations["data_clean"]["spacy_test"], index=False)


In [7]:
# Reload the tokenised frames from disk so the slow conversion does not have
# to be repeated. Rows whose cleaned text is empty show up as NaN after this
# reload (see the null counts further down).
df, df_test = map(
    pd.read_csv,
    (
        db_locations['data_clean']['spacy_train'],
        db_locations['data_clean']['spacy_test'],
    ),
)

In [9]:
# Preview the training rows together with the new cleaned_text column.
df.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_text
0,1,1,A series of escapades demonstrating the adage ...,1,series escapade demonstrate adage good goose g...
1,2,1,A series of escapades demonstrating the adage ...,2,series escapade demonstrate adage good goose
2,3,1,A series,2,series
3,4,1,A,2,
4,5,1,series,2,series


In [12]:
# Count the distinct per-row missing-value patterns in the training frame.
df.isna().value_counts()

PhraseId  SentenceId  Phrase  Sentiment  cleaned_text
False     False       False   False      False           153384
                                         True              2676
Name: count, dtype: int64

In [10]:
# Preview the test rows together with the new cleaned_text column.
df_test.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,cleaned_text
0,156061,8545,An intermittently pleasing but mostly routine ...,intermittently pleasing routine effort
1,156062,8545,An intermittently pleasing but mostly routine ...,intermittently pleasing routine effort
2,156063,8545,An,
3,156064,8545,intermittently pleasing but mostly routine effort,intermittently pleasing routine effort
4,156065,8545,intermittently pleasing but mostly routine,intermittently pleasing routine


In [19]:
# Show the training rows whose cleaned_text is missing.
df.loc[df['cleaned_text'].isna()]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleaned_text
3,4,1,A,2,
6,7,1,of,2,
13,14,1,the,2,
16,17,1,that,2,
18,19,1,what,2,
...,...,...,...,...,...
155587,155588,8515,a few others,2,
155588,155589,8515,few others,2,
155591,155592,8515,do n't make often enough,2,
155592,155593,8515,make often enough,2,


In [26]:
# Count duplicated rows among training phrases that have no cleaned text and
# Sentiment == 1. The comparison is parenthesised because `&` binds tighter
# than `==`: without the parentheses the mask is
# `(isnull & Sentiment) == 1`, not "null text AND Sentiment equals 1".
# NOTE(review): duplicated() compares whole rows; pass subset='Phrase' if the
# goal is to detect repeated phrases — confirm intent.
df[df.cleaned_text.isnull() & (df.Sentiment == 1)].sort_values('Phrase').duplicated().sum()

0

In [13]:
# Count the distinct per-row missing-value patterns in the test frame.
df_test.isna().value_counts()

PhraseId  SentenceId  Phrase  cleaned_text
False     False       False   False           64829
                              True             1462
                      True    True                1
Name: count, dtype: int64

In [18]:
# Show the test rows whose cleaned_text is missing.
df_test.loc[df_test['cleaned_text'].isna()]

Unnamed: 0,PhraseId,SentenceId,Phrase,cleaned_text
2,156063,8545,An,
9,156070,8545,but,
11,156072,8545,mostly,
14,156075,8545,.,
19,156080,8546,is really,
...,...,...,...,...
66126,222187,11844,not quite enough,
66127,222188,11844,quite enough,
66216,222277,11850,-- but,
66248,222309,11852,even if you,


In [30]:
# Handle rows with missing values for the first experiment.
#
# Training rows with a missing value are dropped. The result is rebound
# instead of mutated in place so that re-running the cell stays safe.
df = df.dropna()

# Test rows must NOT be dropped: a prediction is needed for every test
# PhraseId. A missing cleaned_text is replaced with an empty string, which the
# TF-IDF vectorizer turns into an all-zero feature row.
df_test = df_test.assign(cleaned_text=df_test['cleaned_text'].fillna(''))

In [31]:
# Data preparation with TF-IDF, limited to the 5000 highest-ranked terms.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Keep the feature matrices sparse. Converting them with .toarray() would
# allocate a dense ~153k x 5000 float64 array (about 6 GB) for the training
# set alone, while train_test_split and LogisticRegression both accept sparse
# input directly.
# NOTE(review): the vectorizer is fitted on every training row, including the
# rows later held out for validation, so validation scores may be slightly
# optimistic.
X_train = tfidf.fit_transform(df['cleaned_text'])
X_test = tfidf.transform(df_test['cleaned_text'])

# Target variable (Sentiment)
y_train = df['Sentiment']

In [32]:
# Baseline from class: logistic regression on the TF-IDF features,
# evaluated on a 20% hold-out split of the training data.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Hold out a validation set; the fixed seed keeps the split reproducible.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier (fit returns the estimator itself).
model = LogisticRegression(max_iter=1000).fit(X_train_split, y_train_split)

# Predict the held-out rows.
y_pred = model.predict(X_val_split)

# Overall accuracy on the validation split.
accuracy = accuracy_score(y_val_split, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Per-class precision / recall / F1.
print(classification_report(y_val_split, y_pred))

Validation Accuracy: 0.614858036965805
              precision    recall  f1-score   support

           0       0.56      0.18      0.27      1434
           1       0.51      0.33      0.40      5435
           2       0.66      0.87      0.75     15537
           3       0.53      0.45      0.49      6470
           4       0.62      0.23      0.33      1801

    accuracy                           0.61     30677
   macro avg       0.58      0.41      0.45     30677
weighted avg       0.60      0.61      0.58     30677



In [6]:
# for the test data set


