# Import packaeges

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Laptop
[nltk_data]     House\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load datset

In [120]:
df=pd.read_csv('amazon_alexa.tsv',sep='\t')
df.head(2)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1


# Drop null values

In [121]:
df.dropna(inplace=True)

# Select unnecessary columns

In [122]:
final_df=df[['verified_reviews','feedback']]
final_df.head(2)

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1


# Now apply text preprocesing technique in `verified_reviews` col
- Convert the text into lower case
- Remove pouncation
- Remove stop words

In [123]:
from nltk.stem import PorterStemmer ,WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [124]:
def processor(text):
    """
    we can perform text preprocessing on the input text:
    1. Convert text to lowercase.
    2. Remove punctuation.
    3. Tokenize the text into words.
    4. Remove English stopwords.
    5. Stem and lemmatize the remaining words.
    After completing these step we can join the text and return them
    """
        
    text=text.lower()
    
    # remove pouncation
    translator=str.maketrans('', '', string.punctuation)
    text=text.translate(translator)
    
    
    # Tokenize the word
    token=word_tokenize(text)
    
    # Remove Stop Words
    stop_words=set(stopwords.words('english'))
    update_text=[word for word in token if word.lower() not in stop_words]
    
    # Stem the word
    stem=PorterStemmer()
    stem_text=[stem.stem(word) for word in  update_text]
    
    # Lemitize them
    lem=WordNetLemmatizer()
    final_text=[lem.lemmatize(word) for word in stem_text]
    
    return " ".join(final_text)

# Test the fun

In [125]:
final_df['verified_reviews'].iloc[100]

"Great fun getting to know all the functions of this product.  WOW -- family fun and homework help.  Talking with other grandchildren,who also have an Echo, is a HUGE bonus.  Can't wait to learn more and more and more"

In [156]:
processor(final_df['verified_reviews'].iloc[0])

'love echo'

In [157]:
final_df[final_df['feedback']==0]['verified_reviews'].iloc[3]

"Stopped working after 2 weeks ,didn't follow commands!? Really fun when it was working?"

# Apply the fun
- Now we can see that fun is working correctly so we can add this fun into a pipeline

# Train Test Split

In [129]:
from sklearn.model_selection import train_test_split,cross_val_score

In [130]:
x_train,x_test,y_train,y_test=train_test_split(final_df[['verified_reviews']],final_df['feedback'],test_size=0.2,random_state=45)

In [131]:
x_train.head(2)

Unnamed: 0,verified_reviews
1376,This is a gift. But I have one and love it. ...
2055,I really haven’t found any difference between ...


In [132]:
x_test.head(2)

Unnamed: 0,verified_reviews
141,Not much features.
2653,I love it.


In [133]:
y_train.head(1)

1376    1
Name: feedback, dtype: int64

In [134]:
y_test.head(2)

141     0
2653    1
Name: feedback, dtype: int64

# Build a pipeline

In [135]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [136]:
pipe=Pipeline(steps=[
    ("Vectorize",TfidfVectorizer(max_features=1000,preprocessor=processor)),
])
pipe

# Build a transformer

In [137]:
process=ColumnTransformer(transformers=[
    ("transformation",pipe,'verified_reviews')
],remainder='passthrough')
process

# Build Final pipeline

In [138]:
from sklearn.linear_model import LogisticRegression

In [139]:
final=Pipeline(steps=[
    ("Process",process),
    ("model",LogisticRegression(class_weight='balanced'))
])
final

In [92]:
final.fit(x_train,y_train)

In [93]:
y_pred=final.predict(x_test)
y_pred

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,

In [94]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,confusion_matrix

In [95]:
accuracy_score(y_test,y_pred)

0.8746031746031746

In [96]:
precision_score(y_test,y_pred)

0.9713740458015268

In [97]:
recall_score(y_test,y_pred)

0.8883071553228621

In [98]:
f1_score(y_test,y_pred)

0.9279854147675479

In [99]:
confusion_matrix(y_test,y_pred)

array([[ 42,  15],
       [ 64, 509]], dtype=int64)

In [100]:
y_test.value_counts()

feedback
1    573
0     57
Name: count, dtype: int64

# Try different models

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [32]:
models={
    "LR":LogisticRegression(class_weight='balanced'),
    "Tree":DecisionTreeClassifier(class_weight='balanced'),
    "RandomForest":RandomForestClassifier(class_weight='balanced'),
    "Knn":KNeighborsClassifier(),
    "MultiNaivebase":MultinomialNB(),
    "Burnali":BernoulliNB()
}

In [33]:
result={
    "model_name":[],
    "accuracy":[],
    "precession":[],
    'recall':[],
    "f1_score":[],
    "confussion_matrix":[]
}

for model_name,model in models.items():
    print(model)
    
    final=Pipeline(steps=[
    ("Process",process),
    ("model",model)
    ])
    
    final.fit(x_train,y_train)
    
    pre=final.predict(x_test)
    
    score=accuracy_score(y_test,pre)
    precession=precision_score(y_test,pre)
    recall=recall_score(y_test,pre)
    f_score=f1_score(y_test,pre)
    matrix=confusion_matrix(y_test,pre)
    
    
    result['model_name'].append(model_name)
    result['accuracy'].append(score)
    result['precession'].append(precession)
    result['recall'].append(recall)
    result['confussion_matrix'].append(matrix)
    result['f1_score'].append(f_score)

LogisticRegression(class_weight='balanced')
DecisionTreeClassifier(class_weight='balanced')
RandomForestClassifier(class_weight='balanced')
KNeighborsClassifier()
MultinomialNB()
BernoulliNB()


In [34]:
df=pd.DataFrame(result)
df

Unnamed: 0,model_name,accuracy,precession,recall,f1_score,confussion_matrix
0,LR,0.874603,0.971374,0.888307,0.927985,"[[42, 15], [64, 509]]"
1,Tree,0.903175,0.968864,0.923211,0.945487,"[[40, 17], [44, 529]]"
2,RandomForest,0.911111,0.943396,0.95986,0.951557,"[[24, 33], [23, 550]]"
3,Knn,0.909524,0.909524,1.0,0.952618,"[[0, 57], [0, 573]]"
4,MultiNaivebase,0.909524,0.909524,1.0,0.952618,"[[0, 57], [0, 573]]"
5,Burnali,0.887302,0.937282,0.938918,0.938099,"[[21, 36], [35, 538]]"


In [35]:
df.sort_values(by='precession',ascending=False)

Unnamed: 0,model_name,accuracy,precession,recall,f1_score,confussion_matrix
0,LR,0.874603,0.971374,0.888307,0.927985,"[[42, 15], [64, 509]]"
1,Tree,0.903175,0.968864,0.923211,0.945487,"[[40, 17], [44, 529]]"
2,RandomForest,0.911111,0.943396,0.95986,0.951557,"[[24, 33], [23, 550]]"
5,Burnali,0.887302,0.937282,0.938918,0.938099,"[[21, 36], [35, 538]]"
3,Knn,0.909524,0.909524,1.0,0.952618,"[[0, 57], [0, 573]]"
4,MultiNaivebase,0.909524,0.909524,1.0,0.952618,"[[0, 57], [0, 573]]"


# Conclussion
- Now we can train different models and here is the result.
- we also know that data is imbalance so we can't relay on accuracy so we can choose those moels which `precession` is high.
- we can see that `LogisticRegression` has high precession now we can choose `LogisticRegression` and fine tune them

# Select Model is `LogisticRegression`
- Now we can do hyperparameter tunning

In [140]:
final=Pipeline(steps=[
    ("Process",process),
    ("model",LogisticRegression(n_jobs=-1,verbose=1,class_weight='balanced',C=0.09))
])
final

In [141]:
final.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.2s finished


In [142]:
test_pred=final.predict(x_test)

In [143]:
train_pred=final.predict(x_train)

In [144]:
def evulation(actual,pre):
    result={
        "accuracy":accuracy_score(actual,pre),
        "precession":precision_score(actual,pre),
        "recall":recall_score(actual,pre),
        "f1_score":f1_score(actual,pre),
        "confussion_matric":confusion_matrix(actual,pre)
    }
    return result

# train_evulation

In [145]:
evulation(y_train,train_pred)

{'accuracy': 0.8439857086145296,
 'precession': 0.9969056214543579,
 'recall': 0.8331896551724138,
 'f1_score': 0.907724818032402,
 'confussion_matric': array([[ 193,    6],
        [ 387, 1933]], dtype=int64)}

In [146]:
evulation(y_train,train_pred)['confussion_matric']

array([[ 193,    6],
       [ 387, 1933]], dtype=int64)

# test evulation

In [147]:
evulation(y_test,test_pred)

{'accuracy': 0.8317460317460318,
 'precession': 0.9775051124744376,
 'recall': 0.8342059336823735,
 'f1_score': 0.9001883239171375,
 'confussion_matric': array([[ 46,  11],
        [ 95, 478]], dtype=int64)}

In [148]:
evulation(y_test,test_pred)['confussion_matric']

array([[ 46,  11],
       [ 95, 478]], dtype=int64)

# Pickle the model

In [149]:
import pickle as pkl

In [150]:
with open("Sentiment.pkl",'wb') as f:
    pkl.dump(final,f)