In [None]:
# imports
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.feature_extraction import _stop_words as stop_words
from sklearn.model_selection import train_test_split, cross_validate
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
from gensim.utils import tokenize
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import gensim.downloader
from sklearn.preprocessing import StandardScaler

In [None]:
# Bag-of-words baseline: `x_train_sample` is the text to vectorize.
# CountVectorizer simply counts token occurrences per document (see the
# scikit-learn documentation for the available options).
# `min_df=1` keeps every term that occurs in at least one document, which is
# the vocabulary the previous `min_df=0` was after; an integer 0 is rejected
# by the parameter validation in recent scikit-learn releases.
vectorizer = CountVectorizer(min_df=1,
                             lowercase=True)
vectorizer.fit(x_train_sample)

In [None]:
# Simple baseline classifier on the bag-of-words counts; other estimators can
# be swapped in here.
# The sparse matrices produced by the vectorizer are passed straight to the
# model: LogisticRegression accepts sparse input, so densifying the training
# matrix with .toarray() only costs memory, and train and test are now fed in
# the same format.
x_train_sample_counts = vectorizer.transform(x_train_sample)
x_test_sample_counts = vectorizer.transform(x_test_sample)

lg = LogisticRegression()
lg.fit(x_train_sample_counts, y_train_sample)
print(classification_report(y_test_sample, lg.predict(x_test_sample_counts)))

In [None]:
# Compare three text vectorizers (hashing, TF-IDF, raw counts) by fitting the
# same logistic-regression classifier on each representation of the full split.

def _fit_and_report(label, x_train, x_test, **lr_kwargs):
    """Fit a LogisticRegression and print its test accuracy and F1 score.

    Args:
        label: Prefix of the printed metric lines, e.g. 'Hash vectorizer'.
        x_train: Feature matrix aligned with the global `y_train_full`.
        x_test: Feature matrix aligned with the global `y_test_full`.
        **lr_kwargs: Extra keyword arguments forwarded to LogisticRegression.
    """
    model = LogisticRegression(**lr_kwargs)
    model.fit(x_train, y_train_full)
    # Predict once and reuse the result for both metrics.
    predictions = model.predict(x_test)
    print(f'{label} accuracy:', round(accuracy_score(y_test_full, predictions), 4))
    print(f'{label} f1:', round(f1_score(y_test_full, predictions), 4))
    # The model is local to this helper, so no explicit `del` is needed.


# --- Hashing vectorizer ---
# NOTE(review): n_features=3513 is not explained in this cell — confirm how it was chosen.
hash_vectorizer = HashingVectorizer(lowercase=True,
                                    n_features=3513)
hash_x_train_full = hash_vectorizer.fit_transform(x_train_full)
hash_x_test_full = hash_vectorizer.transform(x_test_full)
_fit_and_report('Hash vectorizer', hash_x_train_full, hash_x_test_full)
print("\n")

# --- TF-IDF vectorizer ---
# min_df=1 keeps every term seen in at least one document (the intent of the
# former min_df=0, which recent scikit-learn releases reject for integers).
tfidf_vectorizer = TfidfVectorizer(lowercase=True,
                                   max_features=1000,
                                   min_df=1,
                                   ngram_range=(1, 3),
                                   stop_words="english")
# The `tdidf_` spelling is kept as-is in case other cells refer to these names.
tdidf_x_train_full = tfidf_vectorizer.fit_transform(x_train_full)
tdidf_x_test_full = tfidf_vectorizer.transform(x_test_full)
_fit_and_report('Tfidf vectorizer', tdidf_x_train_full, tdidf_x_test_full)
print("\n")

# --- Count vectorizer ---
count_vectorizer = CountVectorizer(min_df=1,
                                   lowercase=True)
count_x_train_full = count_vectorizer.fit_transform(x_train_full)
count_x_test_full = count_vectorizer.transform(x_test_full)
# This model is the only one given a larger iteration budget.
_fit_and_report('Count vectorizer', count_x_train_full, count_x_test_full, max_iter=1000)



In [None]:
# GloVe word vectors, fetched by name through gensim's downloader.
# The resulting object is used below to look up one vector per token.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-100'
glove_vectors = gensim.downloader.load(GLOVE_MODEL_NAME)

In [None]:
# The simplest possible way of using the embeddings: represent a whole text by
# the average of the vectors of its tokens.
def embbed_review(text, vectors, verbose=True):
    """Return the mean embedding of the tokens found in `text`.

    The text is split with gensim's `tokenize`; each token is looked up in
    `vectors` and the vectors of the tokens that are found are averaged.

    Args:
        text: The (already preprocessed) text to embed.
        vectors: Embedding lookup exposing `vector_size` and `vectors[token]`.
            Assumed to raise KeyError for unknown tokens, as gensim
            KeyedVectors does.
        verbose: When True (the default, matching the previous behaviour),
            print every token that has no embedding. Pass False when
            embedding many texts to avoid flooding the output.

    Returns:
        A 1-D array of length `vectors.vector_size`. It is filled with NaN
        when no token of `text` has an embedding.
    """
    total = np.zeros(vectors.vector_size)
    n_found = 0
    for token in tokenize(text):
        try:
            total += vectors[token]
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is caught so that
            # genuine failures (and KeyboardInterrupt) are no longer swallowed.
            if verbose:
                print(token)
            continue
        n_found += 1

    if n_found == 0:
        # Nothing to average: signal "no embedding" with NaNs rather than zeros.
        return np.full(vectors.vector_size, np.nan)

    return total / n_found

In [None]:
# Show the first test example before and after cleaning, then display the
# embedding of the cleaned version as the cell output.
sample_idx = 0
for label, corpus in (('Original text:', x_test_full),
                      ('Clean text:', x_test_full_clean)):
    print(label, corpus[sample_idx])
    print("\n")
embbed_review(x_test_full_clean[sample_idx], glove_vectors)