In [None]:
import pandas as pd
import numpy as np
import nltk
import re


from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [2]:
data=pd.read_csv("review_data.csv")
data=data[["text","label"]]


data["text"]=data["text"].str.lower()
print(data.head())

                                                text     label
0  okay i\u2019m sorry but taylor swift looks not...  negative
1  @user the dc comics site has batman 44 release...   neutral
2  "frank gaffrey\u002c cliff may\u002c steve eme...  positive
3  the tragedy of only thinking up hilarious twee...  negative
4  "oliseh meets with victor moses in london: sup...   neutral


In [3]:

data=data[data["label"]!="neutral"]

data["label"] = np.where(data["label"] == "positive", 1,0)

data.head()

Unnamed: 0,text,label
0,okay i\u2019m sorry but taylor swift looks not...,0
2,"""frank gaffrey\u002c cliff may\u002c steve eme...",1
3,the tragedy of only thinking up hilarious twee...,0
5,"""people always forget the fact that shawn achi...",1
6,it looks like a beautiful night to throw mysel...,0


##word to vectors


-tokenization
-remove stopwords ,punctuation marks,specialcharecters
-stemming,lemmatization

techniques:
-bow(countVectorizer)
-tfidf
-word2vec

In [4]:

lemma=WordNetLemmatizer()
stop_words=set(stopwords.words("english"))

def clean_text(text):
  text=text.lower()
  text=re.sub(r"[^a-zA-Z0-9]"," ",text)
  words=nltk.word_tokenize(text) 
  return " ".join([lemma.lemmatize(word) for word in words if word not in stop_words])



data["text"]=data["text"].apply(lambda x:clean_text(x))
data["text"].head()

0    okay u2019m sorry taylor swift look nothing li...
2    frank gaffrey u002c cliff may u002c steve emer...
3    tragedy thinking hilarious tweet summer olympi...
5    people always forget fact shawn achieved much ...
6    look like beautiful night throw brooklyn bridg...
Name: text, dtype: object

In [5]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,r2_score,confusion_matrix,classification_report



from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [6]:
x_train,x_test,y_train,y_test=train_test_split(data["text"],data["label"],test_size=0.3,random_state=42)
print(x_train.shape)
print(x_test.shape)

(1009,)
(433,)


In [7]:
def training(x_train_vector,x_test_vector,y_train,y_test):

  models={"naivebayes":MultinomialNB(),
        "logisticregression":LogisticRegression(),
        "svm":SVC(),"decisiontree":DecisionTreeClassifier(),
        "randomforest":RandomForestClassifier(),
        "knn":KNeighborsClassifier()
        }

  for name,model in models.items():
     
    model.fit(x_train_vector,y_train)
    y_test_pred = model.predict(x_test_vector)
    test_model_score = r2_score(y_test, y_test_pred)

    print(name+":")
    print("r2 score:",test_model_score)
    print(confusion_matrix(y_test,y_test_pred))
    print(classification_report(y_test,y_test_pred))
    print("-----------------------------------------------\n\n")

In [8]:
convert_vectors={"countVectorizer":CountVectorizer(max_features=500, binary=False ),"tfidf":TfidfVectorizer(max_features=100,ngram_range=(1,3))}



for name,vector in convert_vectors.items():
  print("vectorizer",name,":::")
  vectorizer=vector
  x_train_vector=vectorizer.fit_transform(x_train).toarray()
  x_test_vector=vectorizer.transform(x_test).toarray()
  training(x_train_vector,x_test_vector,y_train,y_test)

vectorizer countVectorizer :::
naivebayes:
r2 score: -0.21068470929736227
[[142  70]
 [ 61 160]]
              precision    recall  f1-score   support

           0       0.70      0.67      0.68       212
           1       0.70      0.72      0.71       221

    accuracy                           0.70       433
   macro avg       0.70      0.70      0.70       433
weighted avg       0.70      0.70      0.70       433

-----------------------------------------------


logisticregression:
r2 score: -0.30310338939639747
[[144  68]
 [ 73 148]]
              precision    recall  f1-score   support

           0       0.66      0.68      0.67       212
           1       0.69      0.67      0.68       221

    accuracy                           0.67       433
   macro avg       0.67      0.67      0.67       433
weighted avg       0.67      0.67      0.67       433

-----------------------------------------------


svm:
r2 score: -0.34931272944591507
[[145  67]
 [ 79 142]]
              pr