In [None]:
import os
import re
import string

import joblib as jb
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
# Load the train and test splits from CSV files located in the current
# working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Keep only the raw text and its sentiment label; every later cell works
# from these two columns.
train_df = train_df[['text', 'sentiment']]
test_df = test_df[['text', 'sentiment']]

In [None]:
# Keep only usable rows: discard every row labelled 'neutral' and every row
# whose text is missing.
#
# A single boolean mask followed by .copy() replaces the earlier
# drop(..., inplace=True) + re-slice sequence. The result is an independent
# frame, so the column assignments made in later cells do not trigger
# pandas' SettingWithCopyWarning.
train_df = train_df[(train_df['sentiment'] != 'neutral') & train_df['text'].notna()].copy()
test_df = test_df[(test_df['sentiment'] != 'neutral') & test_df['text'].notna()].copy()

In [None]:
# English stop words, stored as a set so the per-token membership test in
# text_tokenize is a hash lookup instead of a linear scan over a list.
stop_words = set(stopwords.words('english'))

def remove_all_punct(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Replace every numeric token with the literal placeholder NUMBER
def remove_number(text):
    """Substitute each number in `text` with the string "NUMBER".

    A match may start with an optional sign and may contain '.', ',' and ':'
    separators, so values such as "-7", "4.5" or "12:30" collapse into a
    single placeholder.
    """
    return re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*', 'NUMBER', text)

# -----------------------------------------------------------------------
# text_preprocess :
# Strips all punctuation, replaces every number with the placeholder
# "NUMBER" and finally lower-cases the result (so the placeholder ends up
# as "number" in the returned string). Tokenization, stop-word removal and
# stemming are handled by the separate helpers defined below.
# -----------------------------------------------------------------------
def text_preprocess(text):
    without_punct = remove_all_punct(text)
    with_placeholders = remove_number(without_punct)
    return with_placeholders.lower()

def text_stemmer(text):
    """Tokenize `text`, Porter-stem each token and re-join the stems with single spaces."""
    porter = PorterStemmer()
    stems = [porter.stem(word) for word in word_tokenize(text)]
    return ' '.join(stems)

def text_tokenize(text):
    """Tokenize `text` and return the tokens that are not in `stop_words`."""
    return [token for token in word_tokenize(text) if token not in stop_words]

def text_tokenize_with_stopwords(text):
    """Tokenize `text` and return all tokens as a new list, stop words included."""
    return list(word_tokenize(text))

In [None]:
# Derive the same four text columns on both splits:
#   preprocessed_text - cleaned, lower-cased text
#   stemmed_text      - preprocessed text with every token stemmed
#   tokens            - preprocessed text tokenized, stop words removed
#   tokens_with_sw    - preprocessed text tokenized, stop words kept
for frame in (train_df, test_df):
    frame['preprocessed_text'] = frame['text'].apply(text_preprocess)
    frame['stemmed_text'] = frame['preprocessed_text'].apply(text_stemmer)
    frame['tokens'] = frame['preprocessed_text'].apply(text_tokenize)
    frame['tokens_with_sw'] = frame['preprocessed_text'].apply(text_tokenize_with_stopwords)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then reuse
# that exact mapping for the test labels. Re-fitting on the test split (as
# fit_transform would) could silently produce a different mapping whenever the
# two splits do not contain the same set of classes; transform() instead
# raises ValueError on a label that was never seen during fitting.
label_encoder = LabelEncoder()
train_df['sentiment_encoded'] = label_encoder.fit_transform(train_df['sentiment'])
test_df['sentiment_encoded'] = label_encoder.transform(test_df['sentiment'])

In [None]:
# Show the fitted class labels; the position of a label in classes_ is the
# integer the encoder assigns to it.
print(label_encoder.classes_)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Relative paths of the persisted artifacts: fit_and_predict dumps the
# classifier to MODEL_FILENAME and count_vector dumps the fitted
# CountVectorizer to VECT_FILENAME.
MODEL_FILENAME = 'sentiment_model.pkl'
VECT_FILENAME = 'sentiment_vectorizer.pkl'
def count_vector(data):
    """Fit a CountVectorizer on `data` and return its bag-of-words matrix.

    Parameters
    ----------
    data : pandas.Series
        Documents to vectorize; values are cast to unicode strings first.

    Returns
    -------
    (vect, count_vectorizer)
        The document-term matrix for `data` and the fitted vectorizer.

    Side effect: the fitted vectorizer is dumped to VECT_FILENAME with joblib.
    """
    count_vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and builds the matrix in a single
    # pass; calling fit() and then transform() tokenizes the corpus twice.
    vect = count_vectorizer.fit_transform(data.values.astype('U'))
    jb.dump(count_vectorizer, VECT_FILENAME)
    return vect, count_vectorizer

def tfidf_vector(data):
    """Fit a TfidfVectorizer on `data`; return (tf-idf matrix, fitted vectorizer)."""
    documents = data.values.astype('U')
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(documents)
    return matrix, vectorizer

# Fit both vectorizers on the stemmed training text only ...
X_train_count, count_vectorizer = count_vector(train_df["stemmed_text"])
X_train_tfidf, tfidf_vectorizer = tfidf_vector(train_df["stemmed_text"])

# ... and project the stemmed test text onto the vocabularies learned above.
X_test_count = count_vectorizer.transform(test_df["stemmed_text"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["stemmed_text"])

In [None]:

# Seed passed to every estimator that accepts a random_state.
random_state = 42

# Candidate classifiers. Only LogisticRegression is active; the other entries
# are commented out and can be re-enabled to compare models.
models=[
        # XGBClassifier(max_depth=6, n_estimators=500),
        # SVC(random_state=random_state, kernel='linear'),
        LogisticRegression(solver = 'sag', random_state=random_state),
        # RandomForestClassifier(n_estimators=500,random_state=random_state),
        # MultinomialNB(),
        # DecisionTreeClassifier(random_state = random_state),
        # KNeighborsClassifier(),
       ]

In [None]:
# Accumulates one summary dict per fit_and_predict call.
metric = []


def fit_and_predict(model, x_train, x_test, y_train, y_test, vectorizer):
    """Fit `model`, evaluate it on the test split, then record and print the metrics.

    Parameters
    ----------
    model : estimator exposing fit / predict / score
    x_train, x_test : feature matrices for the two splits
    y_train, y_test : matching label vectors
    vectorizer : label describing the features (e.g. 'Count vector'); it is
        stored in the metrics and used to decide whether the model is saved.

    Returns
    -------
    dict
        The record that was appended to the module-level `metric` list.

    Side effects: appends to `metric`, prints a classification report and the
    train/test accuracies, and dumps the fitted classifier to MODEL_FILENAME
    when it is a LogisticRegression trained on 'Count vector' features.
    """
    classifier = model
    classifier_name = classifier.__class__.__name__
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    # Only this model / feature combination is persisted for later inference.
    if classifier_name == 'LogisticRegression' and str(vectorizer) == 'Count vector':
        jb.dump(classifier, MODEL_FILENAME)

    f1score = f1_score(y_test, y_pred, average='weighted')
    # Accuracies are reported as whole-number percentages.
    train_accuracy = round(classifier.score(x_train, y_train) * 100)
    test_accuracy = round(accuracy_score(y_test, y_pred) * 100)

    record = {
        "model": classifier_name,
        "f1 score": f1score,
        "train accuracy": train_accuracy,
        "test accuracy": test_accuracy,
        "vectorizer": str(vectorizer),
    }
    metric.append(record)

    print(classifier_name + " using " + str(vectorizer))
    print(classification_report(y_test, y_pred))
    print('Accuracy of classifier on training set:{}%'.format(train_accuracy))
    print('Accuracy of classifier on test set:{}%' .format(test_accuracy))
    return record

In [None]:
# for model in models:
#     y_train = train_df.sentiment_encoded
#     y_test = test_df.sentiment_encoded
#     # x = X_train_count
#     # x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
#     fit_and_predict(model,X_train_count,X_test_count,y_train,y_test,'Count vector')
    
#     # x = X_train_tfidf
#     # # x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2)
#     # fit_and_predict(model,X_train_tfidf,X_test_tfidf,y_train,y_test, 'Tfidf vector')


In [None]:
# metric_df = pd.DataFrame(metric)
# metric_df = metric_df.sort_values('f1 score', ascending=False)
# metric_df.to_csv('model_metrics.csv', index=False)

In [None]:
# Locations of the pickled model and vectorizer that are loaded below.
# The original machine-specific absolute paths remain the defaults; set the
# SENTIMENT_MODEL_PATH / SENTIMENT_VECT_PATH environment variables to point
# the notebook at artifacts stored elsewhere without editing this cell.
# NOTE(review): these assignments rebind the relative filenames defined
# earlier in the notebook, and the two defaults live in different
# directories - confirm that is intended.
MODEL_FILENAME = os.environ.get(
    'SENTIMENT_MODEL_PATH',
    '/Applications/Github/Negotiation-Bot/sentimentAnalysis/sentiment_model.pkl',
)
VECT_FILENAME = os.environ.get(
    'SENTIMENT_VECT_PATH',
    '/Users/kabirbatra/Downloads/sentimentAnalysis/sentiment_vectorizer.pkl',
)
def load_model_and_vectorizer():
    """Load the persisted classifier and vectorizer.

    Reads MODEL_FILENAME first and VECT_FILENAME second and returns them as
    a (model, vectorizer) tuple.
    """
    # joblib.load unpickles its input, so only point these paths at trusted files.
    return jb.load(MODEL_FILENAME), jb.load(VECT_FILENAME)

model, vect = load_model_and_vectorizer()


def predict_intent(text):
    """Predict the sentiment class of one raw sentence, print it and return it.

    Parameters
    ----------
    text : str
        Raw, unprocessed input sentence.

    Returns
    -------
    str
        The predicted class label taken from `label_encoder.classes_`.
    """
    # Mirror the training pipeline: the count vectorizer in this notebook is
    # fitted on the stemmed_text column (text_preprocess followed by
    # text_stemmer), so the query must be stemmed too or its tokens will not
    # line up with the learned vocabulary.
    stemmed_text = text_stemmer(text_preprocess(text))
    vectorized_text = vect.transform([stemmed_text])
    # model.predict returns an array with one encoded label per input row.
    label = model.predict(vectorized_text)
    predicted = label_encoder.classes_[label]
    print(predicted)
    # Hand the label back so callers that assign the result get a value
    # instead of None.
    return predicted[0]

In [None]:
# Smoke-test the loaded model on a sample sentence; predict_intent prints the
# predicted class.
predicted_intent = predict_intent("very old product")