# Machine Learning
## Import pandas library and sklearn

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Declare the file paths

In [2]:
# Relative paths of the labelled-sentence files, keyed by the name of the review source.
filepath_dict = dict(
    yelp='data/yelp_labelled.txt',
    amazon='data/amazon_cells_labelled.txt',
    imdb='data/imdb_labelled.txt',
)

In [5]:
# Echo the mapping so the configured paths are visible in the cell output.
filepath_dict

{'yelp': 'data/yelp_labelled.txt',
 'amazon': 'data/amazon_cells_labelled.txt',
 'imdb': 'data/imdb_labelled.txt'}

## Read the data files

In [18]:
# Load every file listed in filepath_dict into one combined DataFrame.
# Each file is read as tab-separated text with no header row; its two columns
# are named 'sentence' and 'label', and a 'source' column records which key of
# filepath_dict the rows came from.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if
# the raw files contain unbalanced quotes, rows could be merged - confirm the
# row count per source.
dfs = []
for src, path in filepath_dict.items():
    # A distinct name for the per-file frame keeps `df` reserved for the combined result.
    df_src = pd.read_csv(path, names=['sentence', 'label'], sep='\t')
    df_src['source'] = src
    dfs.append(df_src)

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
df = pd.concat(dfs, ignore_index=True)
print(df.iloc[0])
    

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


## Declare the vectorizer

In [13]:
# Two toy sentences used to demonstrate how the vectorizer builds its vocabulary.
sentences = [
    'John likes ice cream',
    'John hates chocolate.',
]

In [15]:
# Bag-of-words vectorizer for the toy sentences. lowercase=False keeps the
# original casing of each token instead of lowercasing it.
# min_df is 1 (the library default) rather than 0: recent scikit-learn releases
# reject an integer min_df below 1 during parameter validation, and both values
# keep every term, since any vocabulary term occurs in at least one document.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer

CountVectorizer(lowercase=False, min_df=0)

## Fitting the data

In [20]:
# Learn the vocabulary from the toy sentences and show the token -> column-index
# mapping. fit() returns the vectorizer itself, so the attribute lookup is chained.
vectorizer.fit(sentences).vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [22]:
# Encode the toy sentences with the fitted vocabulary and densify the result:
# one row per sentence, one column per vocabulary entry, each cell a token count.
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

# Defining a Baseline Model

## Yelp data

In [23]:
# Keep only the rows whose 'source' column equals 'yelp', then display them.
df_yelp = df.loc[df['source'].eq('yelp')]
df_yelp

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
995,I think food should have flavor and texture an...,0,yelp
996,Appetite instantly gone.,0,yelp
997,Overall I was not impressed and would not go b...,0,yelp
998,"The whole experience was underwhelming, and I ...",0,yelp


In [25]:
# Model inputs: the 'sentence' column of the Yelp subset, taken via .values.
# NOTE: this rebinds `sentences`, replacing the toy list defined earlier.
sentences = df_yelp['sentence'].values
# Evaluate `sentences` on its own line here to inspect the values.

In [27]:
# Model targets: the 'label' column of the Yelp subset, taken via .values.
y = df_yelp['label'].values
# Evaluate `y` on its own line here to inspect the values.

In [33]:
# Hold out 40% of the sentences (and their labels) for testing; the fixed
# random_state makes the shuffle repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.4,
    random_state=100,
)

# To inspect a split, evaluate x_train, x_test, y_train or y_test in a cell.

In [34]:
# Create a fresh CountVectorizer with default settings.
# NOTE: this rebinds `vectorizer`, replacing the instance used for the toy sentences.
vectorizer = CountVectorizer()
vectorizer

CountVectorizer()

In [35]:
# Build the vocabulary from the training sentences only; x_test is not passed to fit.
vectorizer.fit(x_train)
vectorizer

CountVectorizer()

In [36]:
# Encode the training sentences as a sparse count matrix using the fitted vocabulary.
X_train = vectorizer.transform(x_train)
X_train

<600x1510 sparse matrix of type '<class 'numpy.int64'>'
	with 5858 stored elements in Compressed Sparse Row format>

In [37]:
# Encode the held-out sentences with the same fitted vocabulary, so the test
# matrix has the same columns as X_train.
X_test  = vectorizer.transform(x_test)
X_test

<400x1510 sparse matrix of type '<class 'numpy.int64'>'
	with 3323 stored elements in Compressed Sparse Row format>

In [38]:
# Baseline model: fit a logistic regression with default settings on the
# training counts, then score it on the held-out test counts.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.8075


In [39]:
# Repeat the baseline experiment once per review source: split, vectorize,
# fit a logistic regression and print its test accuracy.
# NOTE: every name assigned in the loop is notebook-level, so after the loop
# each one holds the value from the last source processed.
for source in df['source'].unique():
    # Restrict to the current source and pull out inputs and targets.
    df_source = df.loc[df['source'].eq(source)]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    # 75/25 split with a fixed seed so the result is repeatable.
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # The vocabulary is learned from the training sentences only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487
