## Intro to Machine Learning, determine if clients are satisfied based on their reviews : Text Classifier.

In [1]:
import pandas as pd

filepath_dict = {
    "yelp": "emotions_data/yelp_labelled.txt",
    "amazon": "emotions_data/amazon_cells_labelled.txt",
    "imdb": "emotions_data/yelp_labelled.txt"
}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath,names=["sentence", "label"], sep="\t")
    df['source'] = source
    df_list.append(df)

df = pd.concat(df_list)

df.head()


Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [2]:
sentences = ["Jhon likes ice cream", "Jhon hates chocolate."]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=0,lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'Jhon': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [4]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [5]:
from sklearn.model_selection import train_test_split

df_yelp = df[df["source"]=="yelp"]

sentences = df_yelp["sentence"].values
y = df_yelp["label"].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,y,test_size=0.25,random_state=1000)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

vectorizer.fit(sentences_train)

x_train = vectorizer.transform(sentences_train)
x_test = vectorizer.transform(sentences_test)
x_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(x_train,y_train)

score = classifier.score(x_test,y_test)

print("Accuracy", score)

Accuracy 0.796


In [8]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7960
