# <center> Project
    
<center> Anqi Zhu

In [None]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from cleantext import clean
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import sklearn
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

## Data Reading

In [None]:
df = pd.read_csv('EMB_goo.csv')
df.head()

## Preprocessing

In [None]:
# drop unuseful column
df = df.drop("Unnamed: 0", axis=1)

# remove emoji
text_list = list(df['content'])
text_list = [clean(i, no_emoji = True) for i in text_list]
df['content'] = text_list

# drop empty reviews
for i in range(df.shape[0]):
    curr_text = df['content'][i]
    if len(curr_text) == 0:
        df = df.drop(i, axis=0)

In [None]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
STOPWORDS = set(stopwords.words('english'))
en_words = set(nltk.corpus.words.words())
stemmer = PorterStemmer()

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    
    # Remove the XXXX values
    text = text.replace('x', '') 
    
    # Remove white space
    text = REMOVE_NUM.sub('', text)

    #  delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text) 

    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    # Stemming the words
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    # removing non-English words
    text = ' '.join(word for word in text.split() if word in en_words)
    
    return text

In [None]:
df['content'] = df['content'].apply(clean_text)

In [None]:
'''
def drop_review(df, na_col=['content'], dup_col=['content', 'userName', 'at']):
    df = df.dropna(subset=na_col) # drop NULL
    df = df.drop_duplicates(subset=dup_col) # drop duplicated reviews
    
    return df

stop_words = set(stopwords.words('english'))
en_words = set(nltk.corpus.words.words('en'))
lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    
    output = []
    for i in range(len(text)):
        text[i] = clean(text[i], no_emoji = True) # remove emojis
        text[i] = text[i].lower() # lowercase
        text[i] = re.sub(r'[^\w\s]','',text[i]) # remove puncuations
        text[i] = re.sub(r'[0-9]+', '', text[i]) # remove numbers
        text[i] = re.sub(r'http\S+', '', text[i]) # remove links
        text[i] = re.sub(r'[^\x00-\x7F]+','', text[i]) # remove non ascii words
        
        # tokens = nltk.word_tokenize(text[i])
        # tokens = [lemmatizer.lemmatize(i) for i in tokens] # lemmatization
        # tokens = [i for i in tokens if i not in stop_words] # remove stop words
        # tokens = [i for i in tokens if i in en_words] # remove non English words
        # tokens = [i for i in tokens if i in en_words or not i.isalpha()] # remove non English words
        # output.append(tokens)
        
    
    return text
         
'''


In [None]:
df

## LDA

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
def print_top_words(model, feature_names, n_top_words):
    tword = []
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        topic_w = " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        tword.append(topic_w)
        print(topic_w)
    return tword

In [None]:
n_features = 1000 # extract 1000 feature names
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10)
tf = tf_vectorizer.fit_transform(df['content'])

In [None]:
n_topics = 8
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
#                                 doc_topic_prior=0.1,
#                                 topic_word_prior=0.01,
                               random_state=0)
lda.fit(tf)

In [None]:
# output feature names of each topic
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

## 2Vector

In [None]:
my_stop_words = ['game', 'addict', 'like', 'enjoy', 'great', 'love', 'super', 'good', 'best', 'play', 
                 'entertain', 'got', 'cool', 'lot', 'time']
from sklearn.feature_extraction import text 
stop_words = text.ENGLISH_STOP_WORDS.union(my_stop_words)


In [None]:
vectorizer = TfidfVectorizer(sublinear_tf= True, norm='l2', ngram_range = (1,2), max_df=0.2, min_df=10, stop_words=stop_words)
X_train_vc = vectorizer.fit_transform(df["content"])

pd.DataFrame(X_train_vc.toarray(), columns=vectorizer.get_feature_names()).head()

## Clustering

In [None]:
k_clusters = 6

model = KMeans(n_clusters=k_clusters, init='k-means++', n_init=10, max_iter=600, tol=0.000001, random_state=0)
model.fit(X_train_vc)
clusters = model.predict(X_train_vc)
df["ClusterName"] = clusters

In [None]:
def get_top_features_cluster(tf_idf_array, prediction, n_feats):
    labels = np.unique(prediction)
    dfs = []
    for label in labels:
        id_temp = np.where(prediction==label) # indices for each cluster
        x_means = np.mean(tf_idf_array[id_temp], axis = 0) # returns average score across cluster
        sorted_means = np.argsort(x_means)[::-1][:n_feats] # indices with top 20 scores
        features = vectorizer.get_feature_names_out()
        best_features = [(features[i], x_means[i]) for i in sorted_means]
        df = pd.DataFrame(best_features, columns = ['features', 'score'])
        dfs.append(df)
    return dfs


In [None]:
def plotWords(dfs, n_feats):
    for i in range(0, len(dfs)):
        plt.figure(figsize=(8, 2))
        plt.title(("Most Common Words in Cluster {}".format(i)), fontsize=10, fontweight='bold')
        sns.barplot(x = 'score' , y = 'features', orient = 'h' , data = dfs[i][:n_feats])


In [None]:
dfs = get_top_features_cluster(X_train_vc.toarray(), clusters, 15)
plotWords(dfs, 15)

In [None]:
df = pd.read_csv('emb_sentiscore.csv')

In [None]:
df.head()

# Classifier

In [62]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from cleantext import clean

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer

# for model-building
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

import random

## ZAQ

In [2]:
df = pd.read_csv('labelled_all.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,userName,date,content,score,if_apple,if_useful,mechanism,ad,money,...,event,keyboard,IP,time/life,customer service,crush,data,system upgrad,connection,other-tech
0,0,Cre8tiv99,2019-03-11 17:34:05,"Hey guys, this is Bryce “Cre8tiv” Demby. Love ...",3,1,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,eclaitse25,2022-04-05 22:28:14,I don't want to bore you with a long review lo...,4,1,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Jackieee003,2020-11-19 03:12:21,Fav game!! Love the challenges and Disney sett...,5,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,tab the great,2021-11-13 03:55:52,This game is great for all ages and fun to play!,5,1,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,Roberts232,2018-03-08 06:26:19,Special Disney emoji earned through play.,3,1,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df = df[['content', 'if_apple', 'if_useful', 'system upgrad', 'connection', 'other-tech']].copy()
df.head()

Unnamed: 0,content,if_apple,if_useful,system upgrad,connection,other-tech
0,"Hey guys, this is Bryce “Cre8tiv” Demby. Love ...",1,1.0,0.0,0.0,0.0
1,I don't want to bore you with a long review lo...,1,1.0,0.0,0.0,0.0
2,Fav game!! Love the challenges and Disney sett...,1,1.0,0.0,0.0,0.0
3,This game is great for all ages and fun to play!,1,1.0,0.0,0.0,0.0
4,Special Disney emoji earned through play.,1,1.0,0.0,0.0,0.0


In [None]:
df.info()

In [4]:
float_to_int = ['if_useful', 'system upgrad', 'connection', 'other-tech'] 
object_to_str = ['content']
df[float_to_int] = df[float_to_int].astype('int')
df[object_to_str] = df[object_to_str].astype('string')

In [None]:
df['if_useful'].value_counts()

In [5]:
df['if_useful'] = df['if_useful'].map({1:1, 0:0, -1:0})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   content        5000 non-null   string
 1   if_apple       5000 non-null   int64 
 2   if_useful      5000 non-null   int64 
 3   system upgrad  5000 non-null   int64 
 4   connection     5000 non-null   int64 
 5   other-tech     5000 non-null   int64 
dtypes: int64(5), string(1)
memory usage: 234.5 KB


## Text Clean & Modeling

In [7]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
STOPWORDS = set(stopwords.words('english'))
en_words = set(nltk.corpus.words.words())
stemmer = PorterStemmer()

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    
    # Remove the XXXX values
    text = text.replace('x', '') 
    
    # Remove white space
    text = REMOVE_NUM.sub('', text)

    #  delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text) 

    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 

    # Stemming the words
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    # removing non-English words
    text = ' '.join(word for word in text.split() if word in en_words)
    
    return text

In [8]:
df['content'] = df['content'].apply(clean_text)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   content        5000 non-null   object
 1   if_apple       5000 non-null   int64 
 2   if_useful      5000 non-null   int64 
 3   system upgrad  5000 non-null   int64 
 4   connection     5000 non-null   int64 
 5   other-tech     5000 non-null   int64 
dtypes: int64(5), object(1)
memory usage: 234.5+ KB


## Modeling

In [10]:
def train_models(X_train, y_train, model):
    
    if model == 'logistic_regression':
        clf = LogisticRegression()
        
    elif model == 'decision_tree':
        clf = tree.DecisionTreeClassifier()
        
    elif model == 'naive_bayes':
        clf = BernoulliNB()
        
    elif model == 'SVC':
        clf = SVC(kernel='linear', probability=True) 
        
    elif model == 'SGD':
        clf = SGDClassifier(loss='modified_huber')
       
    elif model == 'NN': 
        clf = MLPClassifier()
   
    else:
        print('Legit model: logistic_regression, decision_tree, naive_bayes, SVC, SGD, NN')
    
    clf = clf.fit(X_train, y_train)  
    
    return clf
 

In [11]:
def evaluate_models(model, clf, X, y):
    '''
    input: 
        clf
        X = X_valid_vectors_tfidf | X_test_vectors_tfidf
        y = y_valid | y_test
        
    output: precision, recall, F1(macro, micro), acc
    '''
    y_predict = clf.predict(X_valid_vectors_tfidf)
    y_prob = clf.predict_proba(X_valid_vectors_tfidf)[:,1]
    
    print(classification_report(y_valid, y_predict))
    print('Confusion Matrix:\n',confusion_matrix(y_valid, y_predict))
    
    # fpr, tpr, thresholds = roc_curve(y_valid, y_prob)
    # roc_auc = auc(fpr, tpr)
    # print('AUC:', roc_auc) 
    print(md, '-------------------------')
 

In [60]:
def baseline(X_train, y_train, X_test, y_test):
    seq = y_train.unique()
    prob = []
    for i in range(len(seq)):
        curr_value = seq[i]
        prob.append(len([i for i in y_train if i == curr_value]) / len(y_train))
    acc_scores = []
    for i in range(1,10000):
        random_choice = random.choices(seq, weights = prob, k = len(y_test))
        acc_scores.append(accuracy_score(y_test, random_choice))
        
    return np.mean(acc_scores)

### 1-1 content - system upgrad 

In [92]:
X = df['content'] # , 'if_apple', 'if_useful'
y = df['system upgrad']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 



In [93]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
logistic_regression -------------------------
              precision    recall  f1-score   support

          -1       0.06      0.05      0.05        21
           0       0.98      0.98      0.98       976
           1       0.33      0.33      0.33         3

    accuracy                           0.96      1000
   macro avg       0.46      0.45      0.46      1000
weighted avg       0.96      0.96      0.96      1000

Confusion Matrix:
 [[  1  20   0]
 [ 15 959   2]
 [  0   2   1]]
decision_tree -------------------------
              precisio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
SVC -------------------------
              precision    recall  f1-score   support

          -1       0.17      0.10      0.12        21
           0       0.98      0.99      0.98       976
           1       0.00      0.00      0.00         3

    accuracy                           0.97      1000
   macro avg       0.38      0.36      0.37      1000
weighted avg       0.96      0.97      0.96      1000

Confusion Matrix:
 [[  2  19   0]
 [ 10 966   0]
 [  0   3   0]]
SGD -------------------------
              precision    recall  f1-score   su

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9527087708770877

### 1-3 content & apple & useful - system upgrad

In [56]:
X = df[['content', 'if_apple', 'if_useful']]  # , 'if_apple', 'if_useful'
y = df['system upgrad']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 


In [57]:
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1276)
(1000, 1276)


In [58]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple', 'if_useful']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple', 'if_useful']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1279)
(1000, 1279)


In [59]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
logistic_regression -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.21      0.19      0.20        21
           0       0.98      0.98      0.98       976
           1       0.00      0.00      0.00         3

    accuracy                           0.96      1000
   macro avg       0.40      0.39      0.39      1000
weighted avg       0.96      0.96      0.96      1000

Confusion Matrix:
 [[  4  17   0]
 [ 14 959   3]
 [  1   2   0]]
decision_tree -------------------------
              precision    recall  f1-score   support

          -1       0.08      0.14      0.11        21
           0       0.98      0.97      0.97       976
           1       0.00      0.00      0.00         3

    accuracy                           0.95      1000
   macro avg       0.35      0.37      0.36      1000
weighted avg       0.96      0.95      0.95      1000

Confusion Matrix:
 [[  3  18   0]
 [ 32 944   0]
 [  1   2   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.20      0.05      0.08        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.97      1000
   macro avg       0.39      0.35      0.35      1000
weighted avg       0.96      0.97      0.96      1000

Confusion Matrix:
 [[  1  20   0]
 [  4 972   0]
 [  0   3   0]]
SVC -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       1.00      0.10      0.17        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.66      0.37      0.39      1000
weighted avg       0.98      0.98      0.97      1000

Confusion Matrix:
 [[  2  19   0]
 [  0 976   0]
 [  0   3   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9527387738773877

### 1-2 content & apple - system upgrad

In [68]:
X = df[['content', 'if_apple']]  # , 'if_apple', 'if_useful'
y = df['system upgrad']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 


In [69]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1300)
(1000, 1300)


In [70]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
logistic_regression -------------------------




              precision    recall  f1-score   support

          -1       0.17      0.10      0.12        21
           0       0.98      0.99      0.98       976
           1       0.00      0.00      0.00         3

    accuracy                           0.97      1000
   macro avg       0.38      0.36      0.37      1000
weighted avg       0.96      0.97      0.96      1000

Confusion Matrix:
 [[  2  19   0]
 [  9 966   1]
 [  1   2   0]]
decision_tree -------------------------
              precision    recall  f1-score   support

          -1       0.08      0.14      0.10        21
           0       0.98      0.96      0.97       976
           1       0.00      0.00      0.00         3

    accuracy                           0.94      1000
   macro avg       0.35      0.37      0.36      1000
weighted avg       0.96      0.94      0.95      1000

Confusion Matrix:
 [[  3  18   0]
 [ 35 940   1]
 [  0   3   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.20      0.05      0.08        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.97      1000
   macro avg       0.39      0.35      0.35      1000
weighted avg       0.96      0.97      0.96      1000

Confusion Matrix:
 [[  1  20   0]
 [  4 972   0]
 [  0   3   0]]
SVC -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        21
           0       0.98      1.00      0.99       976
           1       0.00      0.00      0.00         3

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.95      0.98      0.96      1000

Confusion Matrix:
 [[  0  21   0]
 [  0 976   0]
 [  0   3   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [71]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9527375737573757

### 2-3 content & apple & useful - connection

In [64]:
X = df[['content', 'if_apple', 'if_useful']]  # , 'if_apple', 'if_useful'
y = df['connection']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 

In [65]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple', 'if_useful']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple', 'if_useful']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1298)
(1000, 1298)


In [66]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  0 987   0]
 [  0   4   0]]
logistic_regression -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.98      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  4 979   4]
 [  0   4   0]]
decision_tree -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.98      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  8 979   0]
 [  0   4   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  0 987   0]
 [  0   4   0]]
SVC -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  0 987   0]
 [  0   4   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         9
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         4

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   9   0]
 [  0 987   0]
 [  0   4   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9742593259325933

### 2-1 content - connection

In [72]:
X = df['content'] # , 'if_apple', 'if_useful'
y = df['connection']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 



In [73]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
logistic_regression -------------------------
              precision    recall  f1-score   support

          -1       0.17      0.12      0.14         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.98      1000
   macro avg       0.39      0.37      0.38      1000
weighted avg       0.98      0.98      0.98      1000

Confusion Matrix:
 [[  1   7   0]
 [  4 981   2]
 [  1   4   0]]
decision_tree -------------------------
              precisio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SVC -------------------------
              precision    recall  f1-score   support

          -1       0.50      0.25      0.33         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.50      0.42      0.44      1000
weighted avg       0.98      0.99      0.98      1000

Confusion Matrix:
 [[  2   6   0]
 [  2 984   1]
 [  0   5   0]]
SGD -------------------------
              precision    recall  f1-score   su

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9742722272227222

### 2-2 content & apple - connection

In [95]:
X = df[['content', 'if_apple']]  # , 'if_apple', 'if_useful'
y = df['connection']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 


In [96]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1329)
(1000, 1329)


In [98]:
models = ['logistic_regression', 'decision_tree', 'SVC', 'naive_bayes', 'SGD', 'NN'] 
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
logistic_regression -------------------------




              precision    recall  f1-score   support

          -1       0.33      0.25      0.29         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.98      1000
   macro avg       0.44      0.41      0.42      1000
weighted avg       0.98      0.98      0.98      1000

Confusion Matrix:
 [[  2   6   0]
 [  4 975   8]
 [  0   5   0]]
decision_tree -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SVC -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.98      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  6 981   0]
 [  0   5   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9742720272027203

### 3-3 content & apple & useful - other-tech

In [80]:
X = df[['content', 'if_apple', 'if_useful']]  # , 'if_apple', 'if_useful'
y = df['other-tech']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 

In [81]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple', 'if_useful']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple', 'if_useful']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1264)
(1000, 1264)


In [99]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN'] 
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
logistic_regression -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.25      0.38      0.30         8
           0       0.99      0.98      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.97      1000
   macro avg       0.41      0.45      0.43      1000
weighted avg       0.98      0.97      0.98      1000

Confusion Matrix:
 [[  3   3   2]
 [  8 972   7]
 [  1   4   0]]
decision_tree -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.98      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  6 981   0]
 [  0   5   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SVC -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9064171417141715

### 3-1 content - other-tech

In [101]:
X = df[['content']]  # , 'if_apple', 'if_useful'
y = df['other-tech']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 

In [102]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN']
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.50      0.02      0.04        46
           0       0.95      1.00      0.97       951
           1       0.00      0.00      0.00         3

    accuracy                           0.95      1000
   macro avg       0.48      0.34      0.34      1000
weighted avg       0.93      0.95      0.93      1000

Confusion Matrix:
 [[  1  45   0]
 [  1 950   0]
 [  0   3   0]]
logistic_regression -------------------------
              precision    recall  f1-score   support

          -1       0.18      0.17      0.18        46
           0       0.96      0.96      0.96       951
           1       0.00      0.00      0.00         3

    accuracy                           0.92      1000
   macro avg       0.38      0.38      0.38      1000
weighted avg       0.92      0.92      0.92      1000

Confusion Matrix:
 [[  8  37   1]
 [ 37 913   1]
 [  0   3   0]]
decision_tree -------------------------
              precisio

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        46
           0       0.95      1.00      0.97       951
           1       0.00      0.00      0.00         3

    accuracy                           0.95      1000
   macro avg       0.32      0.33      0.32      1000
weighted avg       0.90      0.95      0.93      1000

Confusion Matrix:
 [[  0  46   0]
 [  0 951   0]
 [  0   3   0]]
SVC -------------------------
              precision    recall  f1-score   support

          -1       0.28      0.17      0.21        46
           0       0.96      0.98      0.97       951
           1       0.00      0.00      0.00         3

    accuracy                           0.94      1000
   macro avg       0.41      0.38      0.39      1000
weighted avg       0.92      0.94      0.93      1000

Confusion Matrix:
 [[  8  38   0]
 [ 21 929   1]
 [  0   3   0]]
SGD -------------------------
              precision    recall  f1-score   su

In [87]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9064412441244125

### 3-2 content & apple - other-tech

In [88]:
X = df[['content', 'if_apple']]  # , 'if_apple', 'if_useful'
y = df['other-tech']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 

In [89]:
X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple']].reset_index())
X_valid_vectors_tfidf = pd.DataFrame(X_valid_vectors_tfidf.toarray()).join(X_valid[['if_apple']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_valid_vectors_tfidf.shape)

(3000, 1283)
(1000, 1283)


In [100]:
models = ['logistic_regression', 'decision_tree', 'naive_bayes', 'SVC', 'SGD', 'NN'] # 
for md in models:
    clf = train_models(X_train_vectors_tfidf, y_train, md)
    evaluate_models(md, clf, X_valid_vectors_tfidf, y_valid)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
logistic_regression -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.29      0.25      0.27         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.97      1000
   macro avg       0.42      0.41      0.42      1000
weighted avg       0.98      0.97      0.98      1000

Confusion Matrix:
 [[  2   6   0]
 [  5 973   9]
 [  0   5   0]]
decision_tree -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      0.99      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.98      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.98      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  6 981   0]
 [  0   5   0]]
naive_bayes -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SVC -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
SGD -------------------------
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         8
           0       0.99      1.00      0.99       987
           1       0.00      0.00      0.00         5

    accuracy                           0.99      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.97      0.99      0.98      1000

Confusion Matrix:
 [[  0   8   0]
 [  0 987   0]
 [  0   5   0]]
NN -------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [91]:
baseline(X_train_vectors_tfidf, y_train, X_valid_vectors_tfidf, y_valid)

0.9065241524152414

In [108]:
df_all = pd.read_csv('EMB_all.csv')
df_all.head()

Unnamed: 0.1,Unnamed: 0,userName,date,content,score,if_apple,if_useful,crush,data,keyboard,time/life,IP
0,0,Jocelyn Sy,2022-10-27 18:10:20,fun,5,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,crystal may,2022-10-27 12:17:07,aw matter game close progress spent coin gem l...,1,0,1.0,0.0,-1.0,0.0,0.0,0.0
2,2,Coreter,2022-10-27 07:07:17,cool,5,0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Jordan Christie,2022-10-26 15:11:40,use love game recent new go sync progress acco...,2,0,1.0,0.0,0.0,0.0,0.0,0.0
4,4,Heather Nicole,2022-10-26 03:42:00,love game dont know shut time clear make sure ...,4,0,1.0,0.0,0.0,0.0,0.0,0.0


In [109]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170701 entries, 0 to 170700
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  170701 non-null  int64  
 1   userName    170701 non-null  object 
 2   date        170701 non-null  object 
 3   content     163658 non-null  object 
 4   score       170701 non-null  int64  
 5   if_apple    170701 non-null  int64  
 6   if_useful   170701 non-null  float64
 7   crush       170701 non-null  float64
 8   data        170701 non-null  float64
 9   keyboard    170701 non-null  float64
 10  time/life   170701 non-null  float64
 11  IP          170701 non-null  float64
dtypes: float64(6), int64(3), object(3)
memory usage: 15.6+ MB


In [111]:
df_all['content'] = df_all['content'].fillna(' ').astype('string')
df_all['content'] = df_all['content'].apply(clean_text)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170701 entries, 0 to 170700
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  170701 non-null  int64  
 1   userName    170701 non-null  object 
 2   date        170701 non-null  object 
 3   content     170701 non-null  object 
 4   score       170701 non-null  int64  
 5   if_apple    170701 non-null  int64  
 6   if_useful   170701 non-null  float64
 7   crush       170701 non-null  float64
 8   data        170701 non-null  float64
 9   keyboard    170701 non-null  float64
 10  time/life   170701 non-null  float64
 11  IP          170701 non-null  float64
dtypes: float64(6), int64(3), object(3)
memory usage: 15.6+ MB


In [112]:
# 1-1 DT 

X = df['content'] 
y = df['system upgrad']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
# X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
# X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 
X_pred_vectors_tfidf = tfidf_vectorizer.transform(df_all['content']) 

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_vectors_tfidf, y_train)  
y_predict_sys = clf.predict(X_pred_vectors_tfidf)
   

In [116]:
# 2-1 SGD

X = df['content'] 
y = df['connection']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
# X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
# X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 
X_pred_vectors_tfidf = tfidf_vectorizer.transform(df_all['content']) 

clf = SGDClassifier(loss='modified_huber')
clf = clf.fit(X_train_vectors_tfidf, y_train)  
y_predict_conn = clf.predict(X_pred_vectors_tfidf)
   

In [117]:
# 3-3 DT

X = df[['content', 'if_apple', 'if_useful']] 
y = df['other-tech']
# np.random.seed(42)
# split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, stratify=y)
# split the remaining to validation and test 
# X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, train_size=0.5, stratify=y_rem)

# TF-IDF 
tfidf_vectorizer = TfidfVectorizer()
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train.iloc[:, 0]) 
# X_valid_vectors_tfidf = tfidf_vectorizer.transform(X_valid.iloc[:, 0]) 
# X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test) 
X_pred_vectors_tfidf = tfidf_vectorizer.transform(df_all['content']) 

X_train_vectors_tfidf = pd.DataFrame(X_train_vectors_tfidf.toarray()).join(X_train[['if_apple', 'if_useful']].reset_index())
X_pred_vectors_tfidf = pd.DataFrame(X_pred_vectors_tfidf.toarray()).join(df_all[['if_apple', 'if_useful']].reset_index())
print(X_train_vectors_tfidf.shape)
print(X_pred_vectors_tfidf.shape)


clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_vectors_tfidf, y_train)  
y_predict_other = clf.predict(X_pred_vectors_tfidf)
   

(3000, 1326)
(170701, 1326)




In [127]:
df_output = pd.read_csv('EMB_all.csv')

In [128]:
df_output['system upgrad'] = y_predict_sys
df_output['connection'] = y_predict_conn
df_output['other-tech'] = y_predict_other

In [129]:
df_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170701 entries, 0 to 170700
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     170701 non-null  int64  
 1   userName       170701 non-null  object 
 2   date           170701 non-null  object 
 3   content        163658 non-null  object 
 4   score          170701 non-null  int64  
 5   if_apple       170701 non-null  int64  
 6   if_useful      170701 non-null  float64
 7   crush          170701 non-null  float64
 8   data           170701 non-null  float64
 9   keyboard       170701 non-null  float64
 10  time/life      170701 non-null  float64
 11  IP             170701 non-null  float64
 12  system upgrad  170701 non-null  int64  
 13  connection     170701 non-null  int64  
 14  other-tech     170701 non-null  int64  
dtypes: float64(6), int64(6), object(3)
memory usage: 19.5+ MB


In [135]:
df_output.to_csv('EMB_all_zaqzaq.csv', index = False)

In [137]:
pd.read_csv('EMB_all_zaqzaq.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170701 entries, 0 to 170700
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     170701 non-null  int64  
 1   userName       170701 non-null  object 
 2   date           170701 non-null  object 
 3   content        163658 non-null  object 
 4   score          170701 non-null  int64  
 5   if_apple       170701 non-null  int64  
 6   if_useful      170701 non-null  float64
 7   crush          170701 non-null  float64
 8   data           170701 non-null  float64
 9   keyboard       170701 non-null  float64
 10  time/life      170701 non-null  float64
 11  IP             170701 non-null  float64
 12  system upgrad  170701 non-null  int64  
 13  connection     170701 non-null  int64  
 14  other-tech     170701 non-null  int64  
dtypes: float64(6), int64(6), object(3)
memory usage: 19.5+ MB
