In [1]:
import pandas as pd

# Importing dataset

In [2]:
df = pd.read_csv("20191226-reviews.csv")
df.head(5)

Unnamed: 0,rating,title
0,3,"Def not best, but not worst"
1,1,Text Messaging Doesn't Work
2,5,Love This Phone
3,3,"Love the Phone, BUT...!"
4,4,"Great phone service and options, lousy case!"


# Converting the data into string as it might contain some other types of data

In [3]:
df['title']=df['title'].apply(str)

# Pre Processing of Dataset
## Tokenization (divinding the string in individual words)
## Stop Word Removal (is, a, an, the, not removal)
## Punctuation Removal
## Stemming and Lematization

In [4]:
# Cleaning the texts
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
list1=[]
for i in range(0,len(df)):
    
    #replace non-letter with space
    review = re.sub('[^a-zA-Z]', ' ', df['title'][i])
    
    #convert all to lower case
    review=review.lower()
    
    #split
    review=review.split()
    
    #stemming and Lemmanization
    ps=PorterStemmer()
    lm = WordNetLemmatizer()
    
    
    review = [lm.lemmatize(word) for word in review if word not in set(stopwords.words('english'))] 
    review=' '.join(review)
    list1.append(review)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df["reviews"]=list1

In [6]:
df.drop("title", axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,rating,reviews
0,3,def best worst
1,1,text messaging work
2,5,love phone
3,3,love phone
4,4,great phone service option lousy case


# Feature Extraction
## TF-IDF Vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(list1).toarray()
y = df.iloc[:, 1].values

# Training Model

In [22]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20)

# Applying SVM

In [23]:
from sklearn.svm import LinearSVC
lvc=LinearSVC()
lvc.fit(x_train,y_train)
print(f"The accuracy of model is: {lvc.score(x_test,y_test)*100:.2f}%")

The accuracy of model is: 41.50%


# Applying Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=100)
rfc.fit(x_train,y_train)
print(f"The accuracy of model is: {rfc.score(x_test,y_test)*100:.2f}%")

The accuracy of model is: 44.80%
