In [None]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
# English stop words, materialised once as a set so membership tests are cheap.
# Note: this rebinds the imported `stopwords` corpus name to the resulting set.
stopwords = set(stopwords.words('english'))
# One lemmatizer instance, shared by every call that needs to lemmatize a token.
lemmatizer = WordNetLemmatizer()

In [None]:
# Read the training set; row 0 of the CSV supplies the column names.
train_df = pd.read_csv("train.csv", header=0)
# Show how many values are missing in each column, then the (rows, columns) shape.
missing_per_column = train_df.isna().sum()
print(missing_per_column)
print(train_df.shape)

In [None]:
# Fill NaNs with a blank so the string operations below never see a missing value.
# Note: fillna is applied to every column of the frame, not only 'text'.
train_df = train_df.fillna(' ')
# Trim surrounding whitespace; texts that were only blanks (including filled NaNs) become ''.
train_df['text'] = train_df['text'].str.strip()
# Count the length of each text (vectorised string accessor instead of a Python-level lambda).
train_df['text_length'] = train_df['text'].str.len()
print("There are {} rows with text length 0.".format((train_df['text_length'] == 0).sum()))
# Keep only non-empty texts. The explicit .copy() makes the result an independent frame,
# so adding columns to it later does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df['text_length'] > 0].copy()
print(train_df.shape)

In [None]:
def text_processing(x):
    """Normalise one raw document into a space-joined string of lemmas.

    Steps: remove every character that is not an a-z/A-Z letter, a digit,
    whitespace or an apostrophe; tokenize with NLTK; lowercase each token;
    drop English stop words; lemmatize what remains.

    Args:
        x: Raw text of a single document (str).

    Returns:
        str: The surviving lemmas joined by single spaces (may be empty).
    """
    # Preserve only Latin letters, digits, whitespace and apostrophes.
    cleaned_text = re.sub(r'[^a-zA-Z\d\s\']+', '', x)
    # Tokenize, then lowercase *before* the stop-word test: NLTK's English
    # stop-word list is lowercase, so checking the original-case token would
    # let capitalised stop words such as "The" or "And" slip through.
    tokens = (w.lower() for w in nltk.word_tokenize(cleaned_text))
    # Reduce each remaining token to its base (lemma) form.
    words_list = [lemmatizer.lemmatize(w) for w in tokens if w not in stopwords]
    return " ".join(words_list)
    

In [None]:
# Run the cleaning/lemmatisation routine over every document and store the
# result in a new 'base_text' column.
train_df['base_text'] = train_df['text'].apply(text_processing)
print("Done")

In [None]:
# Preview the same document before and after preprocessing (first 100 characters).
# Positional access via .iloc is used on purpose: rows with empty text were
# filtered out earlier, so the index label 1 is not guaranteed to exist any more
# and a label-based lookup could raise KeyError.
preview_pos = 1
print(train_df['text'].iloc[preview_pos][:100])
print("-------------------")
print(train_df['base_text'].iloc[preview_pos][:100])

In [None]:
# Get the target
label = train_df['label'].values
# Train test split. This is done on the cleaned text *before* any vectorizer is
# fitted, so the vocabulary and the IDF weights are learned from the training
# portion only and the held-out set cannot leak into the features.
text_train, text_test, y_train, y_test = train_test_split(
    train_df['base_text'].values, label, test_size = 0.2, random_state = 0
)
# An n-gram is just a string of n words in a row. E.g. the sentence 'Python is cool' contains the 2-grams 'Python is' and 'is cool'.
# The final result if we use ngram_range = (1, 2) is ["Python", "is", "cool", "Python is", "is cool"]
count_vectorizer = CountVectorizer(ngram_range = (1, 2)) # Initialize CountVectorizer with ngram_range = (1, 2)
tfidf_transformer = TfidfTransformer(smooth_idf = False) # Initialize TfidfTransformer
# Fit the count vectorizer on the training texts and transform them
count_vect_train = count_vectorizer.fit_transform(text_train)
# Fit the TF-IDF transformer on the training counts and transform them
tfidf_train = tfidf_transformer.fit_transform(count_vect_train)
x_train = tfidf_train
# The held-out texts are only transformed with the already-fitted objects
count_vect_test = count_vectorizer.transform(text_test)
x_test = tfidf_transformer.transform(count_vect_test)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
# Initialize Passive Aggressive Classifier.
# random_state is fixed (same seed as the train/test split) so that a fresh
# "Restart & Run All" reproduces the same model and the same scores.
pac = PassiveAggressiveClassifier(max_iter = 50, random_state = 0)
pac.fit(x_train, y_train)
y_pred = pac.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and F1 are
# symmetric in their arguments, but precision and recall are not: passing the
# predictions first silently swaps the two reported numbers.
print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Precision score: ", precision_score(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))
print("Recall score: ", recall_score(y_test, y_pred))

In [None]:
# saving model
# You need to save models in app/models folder, or just copy/cut to app/models folder. It's important.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the object has been written.
with open('count_vectorizer.pkl', 'wb') as model_file:
    pickle.dump(count_vectorizer, model_file)
with open('tfidf_transformer.pkl', 'wb') as model_file:
    pickle.dump(tfidf_transformer, model_file)
with open('pac.pkl', 'wb') as model_file:
    pickle.dump(pac, model_file)

In [None]:
# loading model
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# .pkl files that were produced by this notebook or another trusted source.
# Each file is opened in a `with` block so the handle is closed after reading.
with open('count_vectorizer.pkl', 'rb') as model_file:
    count_vectorizer = pickle.load(model_file)
with open('tfidf_transformer.pkl', 'rb') as model_file:
    tfidf_transformer = pickle.load(model_file)
with open('pac.pkl', 'rb') as model_file:
    pac = pickle.load(model_file)