In [69]:
from help_functions import download_and_unzip
import pandas as pd

# Fetch the course archive via the project helper. The next cell reads train.csv / test.csv
# from the working directory -- presumably extracted by this call; confirm in help_functions.
download_and_unzip('https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip', 'getting_started.zip')

Beginning file download...


In [70]:
# Read both CSVs from the working directory (train.csv carries the `text` / `target` columns used below).
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# frac=1 returns every row in random order; the fixed seed makes that order reproducible.
train_df_shuffled = train_data.sample(frac=1, random_state=42)

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the shuffled rows for evaluation; the seed keeps the split reproducible.
# Features are the raw text strings, labels are the `target` column.
features_train, features_test, labels_train, labels_test = train_test_split(
    train_df_shuffled['text'].tolist(),
    train_df_shuffled['target'].tolist(),
    test_size=0.2,
    random_state=42,
)

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# The vocabulary is learned from the training texts only; the test texts are merely transformed.
# NOTE: both names are rebound from raw text to TF-IDF matrices, so re-running this cell
# without first re-running the split feeds the matrices back into the vectorizer.
features_train, features_test = (
    vectorizer.fit_transform(features_train),
    vectorizer.transform(features_test),
)

In [73]:
# Sanity check: train and test must have the same number of columns, because the test
# texts were transformed with the vectorizer fitted on the training texts.
features_train.shape, features_test.shape

((6090, 18454), (1523, 18454))

In [74]:
from sklearn.feature_selection import SelectPercentile, f_classif

# Keep the top 10% of TF-IDF columns, ranked by the f_classif score against the training labels.
perc = SelectPercentile(score_func=f_classif, percentile=10)

# The selector is fitted on the training split only and then re-used on the test split.
# .toarray() converts each selected result to a dense array.
features_train_vec = perc.fit_transform(features_train, labels_train).toarray()
features_test_vec = perc.transform(features_test).toarray()



In [75]:
# Sanity check after feature selection: both splits must keep the same number of columns.
features_test_vec.shape, features_train_vec.shape

((1523, 1846), (6090, 1846))

In [76]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the dense selected features; fit() returns the estimator itself.
ClassifierNB = GaussianNB().fit(features_train_vec, labels_train)

# Predicted labels for the held-out split, as a plain Python list.
pred = ClassifierNB.predict(features_test_vec).tolist()


In [77]:
# Score the Gaussian NB model on the held-out split. The value is stored but not displayed
# here, and `score_test` is overwritten by the later models' scoring cells.
score_test = ClassifierNB.score(features_test_vec, labels_test)

In [78]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score

def matrices_calc(y_true, y_pred):
    """Print and return accuracy, precision, recall, F1 and the confusion-matrix counts.

    All metrics are computed with scikit-learn's default settings for the
    respective scoring functions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, (tn, fp, fn, tp))``. The four
        confusion-matrix counts are plain Python ints.

    Raises:
        ValueError: If the flattened confusion matrix does not contain exactly
            four cells, because it is unpacked into four names.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # .tolist() turns the NumPy scalars into Python ints so the tuple prints as
    # (776, 82, ...) on every NumPy version rather than (np.int64(776), ...).
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel().tolist()

    # Every metric is reported with four decimals (recall was previously unformatted).
    print(
        f'accuracy: {accuracy:.4f}\n'
        f'Precision: {precision:.4f}\n'
        f'Recall: {recall:.4f}\n'
        f'F1-Score: {f1:.4f}\n'
        f'(tn, fp, fn, tp): {(tn, fp, fn, tp)}'
    )
    return accuracy, precision, recall, f1, (tn, fp, fn, tp)

# Report the Gaussian NB metrics. The function prints the report and, as the last
# expression in the cell, its returned tuple is echoed as well.
matrices_calc(labels_test, pred)


accuracy: 0.7262
Precision: 0.8010
Recall: 0.49624060150375937
F1-Score: 0.6128
(tn, fp, fn, tp): (776, 82, 335, 330)


(0.7261982928430729,
 0.8009708737864077,
 0.49624060150375937,
 0.6128133704735376,
 (776, 82, 335, 330))

In [79]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the same selected features; fit() returns the estimator itself.
ClassifierMNB = MultinomialNB().fit(features_train_vec, labels_train)

# Overwrites `pred` from the Gaussian model with this model's held-out predictions.
pred = ClassifierMNB.predict(features_test_vec).tolist()

In [80]:
# Score the Multinomial NB model on the held-out split. This replaces the previous
# `score_test` value and is not displayed here.
score_test = ClassifierMNB.score(features_test_vec, labels_test)

In [81]:
# Report the Multinomial NB metrics; `pred` now holds that model's predictions.
matrices_calc(labels_test, pred)


accuracy: 0.7827
Precision: 0.8538
Recall: 0.606015037593985
F1-Score: 0.7089
(tn, fp, fn, tp): (789, 69, 262, 403)


(0.7826657912015759,
 0.8538135593220338,
 0.606015037593985,
 0.7088830255057168,
 (789, 69, 262, 403))

In [82]:
from sklearn.pipeline import Pipeline

# Re-create the raw-text split: the earlier TF-IDF cell rebound features_train / features_test
# to matrices, while this pipeline vectorises internally and needs the original strings.
# The arguments and seed match the first split, so the partition is the same.
features_train, features_test, labels_train, labels_test = train_test_split(
    train_df_shuffled['text'].tolist(),
    train_df_shuffled['target'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Baseline: TF-IDF features fed straight into Multinomial Naive Bayes (no feature selection step).
model_0 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Left as the last expression so the fitted pipeline is displayed.
model_0.fit(features_train, labels_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])

In [83]:
# The pipeline takes the raw test texts directly; the result is stored as a plain Python list.
pred = model_0.predict(features_test).tolist()

In [103]:
# Keep the pipeline's metric tuple for later use, then display it.
metrics = matrices_calc(labels_test, pred)
metrics


accuracy: 0.7991
Precision: 0.8827
Recall: 0.6225563909774436
F1-Score: 0.7302
(tn, fp, fn, tp): (803, 55, 251, 414)


(0.799080761654629,
 0.8827292110874201,
 0.6225563909774436,
 0.7301587301587301,
 (803, 55, 251, 414))

In [85]:
# Score the pipeline on the held-out split; the displayed value matches the accuracy
# reported by matrices_calc for the same predictions.
score_test = model_0.score(features_test, labels_test)
score_test


0.799080761654629