# Data Analysis by sklearn

## Import packages

In [1]:
import os,re

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth',0)

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 

import nltk
from nltk.corpus import stopwords

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Read data

In [2]:
df = pd.read_csv('data/train.csv')
df.head(10)

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsnyaa Speed ​​of delivery is good.,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma The change ma pictures that the face.,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its surface. The coating looks like it will change colour quickly.,1
5,5,Dg yg depending being sent in photos,1
6,6,Hours not a hologram,1
7,7,"Shop fraudulent business. we put two lamps, one shoe, one nozzle, one wallet, one of gardening mini, 2 bags of fragrant. the total amount including the 144.000d ship. after receipt of goods received only one shoe, one garden mini, 1 straw. still collect enough money but enough goods k. deceptive ah ???",1
8,8,"Well, according to Price",1
9,9,"Pictures and names of elise 7154, but the brand bg balqis other code. X necessarily the same things. Verry disappointed.",1


## Text preprocessing

In [3]:
texts = df['review'].tolist()
labels = df['rating'].tolist()
vectorizer_freq = CountVectorizer(decode_error='ignore',
                                  token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                                  stop_words='english',
                                  lowercase=True,
                                  max_df=0.8,
                                  min_df=5)
train_vsm_freq = vectorizer_freq.fit_transform(texts).toarray()
train_vsm_freq[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [7]:
features = vectorizer_freq.get_feature_names()

In [21]:
df[df['review'].str.contains(r'(\sa{3,})', re.IGNORECASE)]

  return func(self, *args, **kwargs)


Unnamed: 0,review_id,review,rating
3017,3017,There is a dud brg sent. disappointed aaaaassssssssssssssaaaaaa,1
3139,3139,Ssstttttttt ....... Duuuhhh less like goods ... Not rekomend aaahh,1
10302,10302,Housekeeping lamaaaaa \ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,1
16417,16417,Loved it anyway. But his Packingg lamaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa @ aaaaaaaaaaa,2
26507,26507,Brgnya to secure dg. A flexible hny gk n comfortable use. such as in the photo could gk aaat meeting dibentang upright ... mgkn bgtu SDH production dri ... jd somewhat disappointed sdkt ...,2
30239,30239,Makasiiii8iiiiiiiiiiiiihhhhhhhhh yaaaaaaaa aaaaaaaaa,3
33582,33582,"Unyu motive Unyu semuaaaa like AAAA aaaaaaaaaa, inshaAllah next order again",3
34777,34777,"Mkzih sdh kk dg package safe tetima Blm attempted, but mmg cingcau it cool in the stomach. Will continue to make my every day capcin ha aaaaaaa",3
40055,40055,Still good yes but small but tasty really good dipakenya his kayak may well aaammmmiiiiiiin nnn,3
49100,49100,Saaangaaat sssuuuukaaaa coccooook passss Saaangaaat sssuuuuka ssseeeeneeeng ssekaaaaliijj aaakkkuju Thanks seller penggirimanny friendly jg neat and very fast Thank kasiih kakakak sai ... Packing neat product Sm kyk real goods pict Allhamdulillah yes very happy passs product also siiip😍😍ny,3


## Split train-test data

In [4]:
x_train, x_test, y_train, y_test = train_test_split(train_vsm_freq, labels, test_size=0.2)

MemoryError: Unable to allocate 9.30 GiB for an array with shape (117448, 10624) and data type int64

In [22]:
model = MultinomialNB()
model.fit(train_vsm_freq, y_train)

MultinomialNB()

In [24]:
test_vsm_freq = vectorizer_freq.fit_transform(X_test).toarray()
y_predict = model.predict(test_vsm_freq)
ratio = sum(y_predict==y_test)/len(y_test)
print(ratio)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 6985 is different from 6997)

In [22]:
knn = KNeighborsClassifier()
knn.fit(XX_train, yy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [36]:
y_predict = knn.predict(XX_test)

In [4]:
eng_stopwords = stopwords.words('english')

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)

df['clean_review'] = df.review.apply(clean_text)
df.head()

Unnamed: 0,review_id,review,rating,clean_review
0,0,Ga disappointed neat products .. Meletot Hilsnyaa Speed ​​of delivery is good.,1,ga disappointed neat products meletot hilsnyaa speed delivery good
1,1,"Rdtanya replace broken glass, broken chargernya",1,rdtanya replace broken glass broken chargernya
2,2,Nyesel bngt dsni shopping antecedent photo message pictures gk according foto.di existing collagen super fit nyampe holo my house open ehhh collagen contents even in the face pdahal jg description super existing collagen originalnya.pas writing my check lg in photo captions already ma The change ma pictures that the face.,1,nyesel bngt dsni shopping antecedent photo message pictures gk according foto di existing collagen super fit nyampe holo house open ehhh collagen contents even face pdahal jg description super existing collagen originalnya pas writing check lg photo captions already change pictures face
3,3,Sent a light blue suit goods ga want a refund,1,sent light blue suit goods ga want refund
4,4,Pendants came with dents and scratches on its surface. The coating looks like it will change colour quickly.,1,pendants came dents scratches surface coating looks like change colour quickly


In [5]:
vectorizer_freq = CountVectorizer(decode_error='ignore',
                                  token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                                  stop_words=ord='english',
                                  lowercase=True,
                                  max_df=0.7,
                                  min_df=15)
train_vsm_freq = vectorizer_freq.fit_transform(df.clean_review).toarray()

In [39]:
train_vsm_freq.shape

(146811, 5155)

In [41]:
vectorizer_freq.get_feature_names()

['aa',
 'aaa',
 'aaaaa',
 'aamiin',
 'abal',
 'abis',
 'abit',
 'able',
 'abrasions',
 'abroad',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbent',
 'absorbing',
 'absorbs',
 'abu',
 'abx',
 'accelerated',
 'accept',
 'acceptable',
 'accepted',
 'access',
 'accessories',
 'accidentally',
 'accommodating',
 'accomodating',
 'accordance',
 'according',
 'accordingly',
 'account',
 'accuracy',
 'accurate',
 'acne',
 'acquired',
 'across',
 'act',
 'action',
 'active',
 'activities',
 'actual',
 'actually',
 'ad',
 'ada',
 'adain',
 'adapter',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'address',
 'ade',
 'adek',
 'adem',
 'ademm',
 'ademmm',
 'adhesive',
 'adidas',
 'adjust',
 'adjustable',
 'adjusted',
 'admin',
 'adminnya',
 'adminya',
 'admit',
 'adopted',
 'adorable',
 'ads',
 'adult',
 'adults',
 'advance',
 'advanced',
 'advantage',
 'advent',
 'advertised',
 'advertisement',
 'advertising',
 'advice',
 'advise',
 'advised',
 'ae',
 'affairs',
 'affect',
 'affection

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train_vsm_freq, df.rating, test_size=0.3)

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
y_predict = knn.predict(X_test)
ratio = sum(y_predict==y_test)/len(y_test)
print(ratio)