# Text Classification With Python and Keras

## Importing the libraries

In [165]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


## Loading the data with pandas

In [166]:
# Dataset name -> path of its tab-separated file of labelled sentences.
filepath_dict = {
    'yelp': 'data/yelp_labelled.txt',
    'amazon': 'data/amazon_cells_labelled.txt',
    'imdb': 'data/imdb_labelled.txt',
}

# Read every file with explicit column names and tag its rows with the
# dataset name they came from.
df_list = []
for source, filepath in filepath_dict.items():
    frame = pd.read_csv(filepath, sep='\t', names=['sentence', 'label'])
    df_list.append(frame.assign(source=source))

# Each frame keeps its own index after concatenation, so label 0 selects
# the first row of every dataset.
df = pd.concat(df_list)
print(df.loc[0])

                                            sentence  label  source
0                           Wow... Loved this place.      1    yelp
0  So there is no way for me to plug it in here i...      0  amazon
0  A very, very, very slow-moving, aimless movie ...      0    imdb


## Splitting the data

### Training yelp data


In [167]:
# Keep only the yelp rows, then pull sentences and labels out as arrays.
is_yelp = df['source'] == 'yelp'
df_yelp = df.loc[is_yelp]
X = df_yelp['sentence'].values
Y = df_yelp['label'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100
)

### Convert the yelp training and testing sentences into sklearn feature vectors

In [168]:
# Build the vocabulary from the training sentences only, then encode both
# splits with that same fitted vectorizer.
vectorizer = CountVectorizer().fit(x_train)

X_train, X_test = (vectorizer.transform(split) for split in (x_train, x_test))

### Classify with a logistic regression model

In [169]:
# Fit a default-parameter logistic regression on the training vectors and
# score it on the held-out vectors.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.835


## Training amazon data

In [170]:
# Keep only the amazon rows, then pull sentences and labels out as arrays.
is_amazon = df['source'] == 'amazon'
df_amazon = df.loc[is_amazon]
X = df_amazon['sentence'].values
Y = df_amazon['label'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100
)

### Convert the amazon training and testing sentences into sklearn feature vectors

In [171]:
# Build the vocabulary from the training sentences only, then encode both
# splits with that same fitted vectorizer.
vectorizer = CountVectorizer().fit(x_train)

X_train, X_test = (vectorizer.transform(split) for split in (x_train, x_test))

### Classify with a logistic regression model

In [172]:
# Fit a default-parameter logistic regression on the training vectors and
# score it on the held-out vectors.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.845


## Training imdb data

In [173]:
# Keep only the imdb rows, then pull sentences and labels out as arrays.
is_imdb = df['source'] == 'imdb'
df_imdb = df.loc[is_imdb]
X = df_imdb['sentence'].values
Y = df_imdb['label'].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100
)

### Convert the imdb training and testing sentences into sklearn feature vectors

In [174]:
# Build the vocabulary from the training sentences only, then encode both
# splits with that same fitted vectorizer.
vectorizer = CountVectorizer().fit(x_train)

X_train, X_test = (vectorizer.transform(split) for split in (x_train, x_test))

### Classify with a logistic regression model

In [175]:
# Fit a default-parameter logistic regression on the training vectors and
# score it on the held-out vectors.
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.7466666666666667
