In [1]:
import nltk
import gensim
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import models
from gensim.models.doc2vec import Doc2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
import pickle 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Lazily-filled cache of the NLTK helpers used by pre_process. Building them
# once matters because TfidfVectorizer calls the tokenizer for every document.
_PRE_PROCESS_RESOURCES = {}


def _pre_process_resources():
    """Return the shared tokenizer / stop-word set / stemmer / lemmatizer."""
    if not _PRE_PROCESS_RESOURCES:
        # Build everything first so a failure (e.g. a missing NLTK corpus)
        # never leaves the cache half-populated.
        resources = {
            "tokenizer": RegexpTokenizer(r'\w+'),
            "stop_words": frozenset(stopwords.words("english")),
            "stemmer": PorterStemmer(),
            "lemmatizer": WordNetLemmatizer(),
        }
        _PRE_PROCESS_RESOURCES.update(resources)
    return _PRE_PROCESS_RESOURCES


def pre_process(text):
    """Turn a raw phrase into a list of normalised tokens.

    Steps, applied in order:
      1. split ``text`` into runs of word characters (regex ``\\w+``), which
         discards punctuation and whitespace;
      2. lowercase every token;
      3. drop tokens found in NLTK's English stop-word list;
      4. Porter-stem each remaining token;
      5. pass each stem through the WordNet lemmatizer.

    Lowercasing happens *before* the stop-word check. The stop-word list is
    lowercase, so checking the original casing would let capitalised stop
    words such as "The" slip through.

    Parameters
    ----------
    text : str
        The phrase to tokenise.

    Returns
    -------
    list of str
        The processed tokens, in their original order. Empty if ``text``
        contains no word characters or only stop words.
    """
    resources = _pre_process_resources()
    stop_words = resources["stop_words"]
    stem = resources["stemmer"].stem
    lemmatize = resources["lemmatizer"].lemmatize

    lowered = (token.lower() for token in resources["tokenizer"].tokenize(text))
    return [lemmatize(stem(token)) for token in lowered if token not in stop_words]


In [2]:
# Load the training data: text comes from the 'Phrase' column, the target is
# whatever the last column of the file holds.
orig = pd.read_csv('Dataset/train.tsv',sep='\t')

label = orig.iloc[:,-1]

# TF-IDF over unigrams and bigrams, tokenised by pre_process.
# NOTE(review): the vectorizer is fitted on every row *before* the train/test
# split below, so the vocabulary and IDF weights also see the held-out rows.
# Left as-is to keep the pickled artefact unchanged; consider fitting on the
# training rows only if the reported accuracy is meant as an unbiased estimate.
vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=pre_process)
vectors = vectorizer.fit_transform(orig['Phrase'])
with open("Pickles/vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

X_train, X_test, y_train, y_test = train_test_split(vectors, label, test_size=0.33, random_state=42)

# Seeded so that re-running the cell reproduces the same forest; n_jobs=-1
# builds trees on all available cores. Swap in MultinomialNB() to compare.
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
model = clf.fit(X_train,y_train)

# Persist the fitted model, then read it back so the evaluation below
# exercises exactly what was written to disk.
filename = 'Pickles/finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

predicted = loaded_model.predict(X_test)
# Label the score with the class that was actually trained so the message
# cannot drift out of sync with the classifier again.
print(f"{type(loaded_model).__name__} Accuracy:", metrics.accuracy_score(y_test, predicted))
    


KeyboardInterrupt: 

In [None]:
import seaborn as sns

# Relies on `filename`, `X_test` and `y_test` defined by the training cell.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

predicted = loaded_model.predict(X_test)

# Rows are the true labels, columns the predicted labels.
conf_matrix = confusion_matrix(y_test, predicted)

# Show the matrix instead of leaving it as an unused variable.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix (held-out split)")
plt.show()


In [None]:
# Score a single hand-written phrase with the persisted vectorizer and model.
demo = ["positive good best happy happy"]
df1 = pd.DataFrame(demo,columns=['Phrase'])

# Must match the path the training cell wrote the model to.
filename = 'Pickles/finalized_model.sav'

# The pickled vectorizer refers to pre_process by name, so that function has
# to be defined in the running kernel before this load.
with open("Pickles/vectorizer.pickle", 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
vectors = vectorizer.transform(df1['Phrase'])

with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

predicted = loaded_model.predict(vectors)
print(predicted)