In [1]:
import pandas as pd
import re

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
vect = CountVectorizer()

data = pd.read_csv('phone_user_review_file_1.csv',sep=",", encoding='Latin-1')


stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()





In [18]:
"""captuare english language data"""
en_data = data[data['lang'] == 'en']

"""first we remove NaN data from data set"""
# rounding data for example 9.2 > 9 
en_data = en_data.dropna().round()


#sample data from dataset
s_data = en_data[en_data['score'] == 1].copy()
for i in range(2,11):
    tmp = en_data[en_data['score'] == i].reset_index(drop=True)[:5000]
    s_data = s_data.append(tmp, ignore_index=True)

y = s_data['score'].tolist()

In [19]:
new_y = list()
for i in y:
    if (i <= 3):
        new_y.append('weak')
    elif(i <= 6):
        new_y.append('mid')
    else:
        new_y.append('good')
    
y = new_y

In [5]:
def new_finder(char):
    
    num = ord(char)
    
    if (num >= 65) and (num <= 90):
        return chr(num+32)
    if (num >= 97) and (num <= 122):
        return char
    if (num == 32) or (num == 39):
        return char
    return ''
        

def normalizer(text):
    """
        we want remove other letter from text and just keep english letter
        and lower case.
    """
    map_list = map(new_finder, text)
    text = ''.join(map_list)
    
    # tokenizing text and create list of the words
    word_tokens = word_tokenize(text)
    # stemming every words and and choose word if not in stopwords list
    # and delete words that have lesss than 2 letter 
    filtered_sentence = [stemmer.stem(w) for w in word_tokens if not w in stop_words and len(w) > 2]
    
    # finally create string rest of this proccess!
    filtered_sentence = ' '.join(filtered_sentence)[0:]
    return filtered_sentence

In [6]:
def feature(data):
    data_text = data['extract'].tolist()
    feature_list = list(map(normalizer,data_text))
    return feature_list

In [7]:
def vectorize(feature_list):
    X_dtm = vect.fit_transform(feature_list)
    X_dtm = X_dtm.toarray()
    return X_dtm

In [8]:
def feature_selection(k, X_dtm, y):
    global chi2_features
    chi2_features = SelectKBest(chi2, k=k)
    X_kbest_features = chi2_features.fit_transform(X_dtm, y)
    return X_kbest_features


In [9]:
def proc(data):
    tokens = feature(data)
    transform_data = vect.transform(tokens).toarray()
    selection_feature = chi2_features.transform(transform_data)
    return selection_feature


In [20]:
train_data , test_data , y_train, y_test = train_test_split(s_data, y, random_state=3)

In [21]:
X_dtm = vectorize(feature(train_data))

In [22]:
# X_dtm = vectorize(feature(train_data))
X_train = feature_selection(4793, X_dtm, y_train)
X_test = proc(test_data)

In [23]:
X_dtm.shape

(23335, 13128)

In [24]:
clf = MultinomialNB()
clf.fit(X_train,y_train)
predict_val_nb = clf.predict(X_test)

In [25]:
print(classification_report(y_test, predict_val_nb))

              precision    recall  f1-score   support

        good       0.83      0.85      0.84      3972
         mid       0.62      0.66      0.64      2537
        weak       0.59      0.47      0.52      1270

    accuracy                           0.72      7779
   macro avg       0.68      0.66      0.67      7779
weighted avg       0.72      0.72      0.72      7779



In [16]:
data = pd.read_csv('phone_user_review_file_1.csv',sep=",", encoding='Latin-1')

In [54]:
new_data = data.dropna()

In [59]:
compression_opts = dict(method='zip',
                        archive_name='out.csv')
new_data.to_csv('out.zip', index=False,
          compression=compression_opts)

In [60]:
test = pd.read_csv('out.csv',sep=",", encoding='Latin-1')

In [86]:
test['extract'][240]

'Plussaa: +Todella sulava kokemus arkikÃ\x83Â¤ytÃ\x83Â¶ssÃ\x83Â¤ ilman tÃ\x83Â¶kkimisiÃ\x83Â¤ +Akku kestÃ\x83Â¤Ã\x83Â¤ kokopÃ\x83Â¤ivÃ\x83Â¤n vaikka pelaisikin paljon +Kamera Miinusta: -Bixby ja sen paikine -> Google assistant korvaa ja sovellus joka antaa remapata kyseisen napin -Ei stereo kaiutinta -Hinta MikÃ\x83Â¤li hinta ei tunnu liian suolaiselta niin...'

In [64]:
new_data.shap

(360802, 11)