In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import gensim
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import csv

import sys

sys.path.append('../notebooks/Functions/')
from TextMiningProcesses import column_lemmatizer

In [11]:
df = pd.read_json("../data/Appliances.json", lines = True)
df = df.dropna(subset='reviewText')

vectorizer = CountVectorizer(stop_words='english')

features = df['reviewText'].iloc[0:150000]
target = df['overall'].iloc[0:150000]

lemmed_features = column_lemmatizer(features)

vectored_features = vectorizer.fit_transform(lemmed_features)

# Make a dataframe for machine learning
total_features = pd.DataFrame(vectored_features.toarray(), columns=vectorizer.get_feature_names_out())

print(total_features.shape)
print(target.shape)


(150000, 20402)
(150000,)


In [3]:
best_rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, n_jobs=-1)

rfecv = RFE(estimator=best_rf_model, step=1000, n_features_to_select=3000, verbose=2)

rfecv.fit(total_features, target)

Fitting estimator with 20402 features.
Fitting estimator with 19402 features.
Fitting estimator with 18402 features.
Fitting estimator with 17402 features.
Fitting estimator with 16402 features.
Fitting estimator with 15402 features.
Fitting estimator with 14402 features.
Fitting estimator with 13402 features.
Fitting estimator with 12402 features.
Fitting estimator with 11402 features.
Fitting estimator with 10402 features.
Fitting estimator with 9402 features.
Fitting estimator with 8402 features.
Fitting estimator with 7402 features.
Fitting estimator with 6402 features.
Fitting estimator with 5402 features.
Fitting estimator with 4402 features.
Fitting estimator with 3402 features.


In [4]:
rfecv.score(total_features, target)

0.9701866666666666

In [16]:
features01 = rfecv.support_
best_features01_df = total_features.loc[:, features01]
best_features01 = best_features01_df.columns.tolist()
print(len(best_features01))
print(type(best_features01))
print(best_features01[:8])

3000
<class 'list'>
['ability', 'able', 'absolute', 'absolutely', 'absorb', 'ac', 'accept', 'acceptable']


In [17]:


with open('Q1_best_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(best_features01)


In [2]:
df = pd.read_json("../data/Appliances.json", lines = True)
df = df.dropna(subset='reviewText')

vectorizer = CountVectorizer(stop_words='english')

features02 = df['reviewText'].iloc[150000:300000]
target02 = df['overall'].iloc[150000:300000]

lemmed_features02 = column_lemmatizer(features02)

vectored_features02 = vectorizer.fit_transform(lemmed_features02)

# Make a dataframe for machine learning
total_features02 = pd.DataFrame(vectored_features02.toarray(), columns=vectorizer.get_feature_names_out())

print(total_features02.shape)
print(target02.shape)

(150000, 19306)
(150000,)


In [3]:
best_rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, n_jobs=-1)

rfe02 = RFE(estimator=best_rf_model, step=1000, n_features_to_select=3000, verbose=2)

rfe02.fit(total_features02, target02)

Fitting estimator with 19306 features.
Fitting estimator with 18306 features.
Fitting estimator with 17306 features.
Fitting estimator with 16306 features.
Fitting estimator with 15306 features.
Fitting estimator with 14306 features.
Fitting estimator with 13306 features.
Fitting estimator with 12306 features.
Fitting estimator with 11306 features.
Fitting estimator with 10306 features.
Fitting estimator with 9306 features.
Fitting estimator with 8306 features.
Fitting estimator with 7306 features.
Fitting estimator with 6306 features.
Fitting estimator with 5306 features.
Fitting estimator with 4306 features.
Fitting estimator with 3306 features.


In [4]:
features02 = rfe02.support_
best_features02_df = total_features02.loc[:, features02]
best_features02 = best_features02_df.columns.tolist()
print(len(best_features02))
print(type(best_features02))
print(best_features02[:8])

with open('Q2_best_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(best_features02)


3000
<class 'list'>
['ability', 'able', 'absolute', 'absolutely', 'ac', 'accept', 'acceptable', 'access']


In [5]:
df = pd.read_json("../data/Appliances.json", lines = True)
df = df.dropna(subset='reviewText')

vectorizer = CountVectorizer(stop_words='english')

features03 = df['reviewText'].iloc[300000:450000]
target03 = df['overall'].iloc[300000:450000]

lemmed_features03 = column_lemmatizer(features03)

vectored_features03 = vectorizer.fit_transform(lemmed_features03)

# Make a dataframe for machine learning
total_features03 = pd.DataFrame(vectored_features03.toarray(), columns=vectorizer.get_feature_names_out())

print(total_features03.shape)
print(target03.shape)

(150000, 18086)
(150000,)


In [6]:
best_rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, n_jobs=-1)

rfe03 = RFE(estimator=best_rf_model, step=1000, n_features_to_select=3000, verbose=2)

rfe03.fit(total_features03, target03)

Fitting estimator with 18086 features.
Fitting estimator with 17086 features.
Fitting estimator with 16086 features.
Fitting estimator with 15086 features.
Fitting estimator with 14086 features.
Fitting estimator with 13086 features.
Fitting estimator with 12086 features.
Fitting estimator with 11086 features.
Fitting estimator with 10086 features.
Fitting estimator with 9086 features.
Fitting estimator with 8086 features.
Fitting estimator with 7086 features.
Fitting estimator with 6086 features.
Fitting estimator with 5086 features.
Fitting estimator with 4086 features.
Fitting estimator with 3086 features.


In [7]:
features03 = rfe03.support_
best_features03_df = total_features03.loc[:, features03]
best_features03 = best_features03_df.columns.tolist()
print(len(best_features03))
print(type(best_features03))
print(best_features03[:8])

with open('Q3_best_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(best_features03)


3000
<class 'list'>
['ability', 'able', 'absolute', 'absolutely', 'absorb', 'ac', 'accept', 'acceptable']


In [8]:
df = pd.read_json("../data/Appliances.json", lines = True)
df = df.dropna(subset='reviewText')

vectorizer = CountVectorizer(stop_words='english')

features04 = df['reviewText'].iloc[450000:]
target04 = df['overall'].iloc[450000:]

lemmed_features04 = column_lemmatizer(features04)

vectored_features04 = vectorizer.fit_transform(lemmed_features04)

# Make a dataframe for machine learning
total_features04 = pd.DataFrame(vectored_features04.toarray(), columns=vectorizer.get_feature_names_out())

print(total_features04.shape)
print(target04.shape)

(152453, 20936)
(152453,)


In [9]:
best_rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, n_jobs=-1)

rfe04 = RFE(estimator=best_rf_model, step=1000, n_features_to_select=3000, verbose=2)

rfe04.fit(total_features04, target04)

Fitting estimator with 20936 features.
Fitting estimator with 19936 features.
Fitting estimator with 18936 features.
Fitting estimator with 17936 features.
Fitting estimator with 16936 features.
Fitting estimator with 15936 features.
Fitting estimator with 14936 features.
Fitting estimator with 13936 features.
Fitting estimator with 12936 features.
Fitting estimator with 11936 features.
Fitting estimator with 10936 features.
Fitting estimator with 9936 features.
Fitting estimator with 8936 features.
Fitting estimator with 7936 features.
Fitting estimator with 6936 features.
Fitting estimator with 5936 features.
Fitting estimator with 4936 features.
Fitting estimator with 3936 features.


In [10]:
features04 = rfe04.support_
best_features04_df = total_features04.loc[:, features04]
best_features04 = best_features04_df.columns.tolist()
print(len(best_features04))
print(type(best_features04))
print(best_features04[:8])

with open('Q4_best_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(best_features04)


3000
<class 'list'>
['ability', 'able', 'absolute', 'absolutely', 'absorb', 'abuse', 'ac', 'accept']


In [1]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import csv

import sys

sys.path.append('../notebooks/Functions/')
from NewTextMiningProcesses import new_column_lemmatizer, new_column_stemmatizer, new_count_vectorize_data

df = pd.read_json("../data/Appliances.json", lines = True)
df = df.dropna(subset='reviewText')

vectorizer = CountVectorizer()

all_features = df['reviewText']
all_target = df['overall']

lemmed_best_features = new_column_lemmatizer(all_features)

vectored_best_features = vectorizer.fit_transform(lemmed_best_features)

# Make a dataframe for machine learning
total_best_features = pd.DataFrame(vectored_best_features.toarray(), columns=vectorizer.get_feature_names_out())

print(total_best_features.shape)
print(all_target.shape)

(602453, 3931)
(602453,)


In [2]:
lem_stem_4000_best_features = pd.concat([total_best_features, all_target], axis = 1)

print(lem_stem_4000_best_features.shape)

(602777, 3932)


In [3]:
lem_stem_4000_best_features.to_csv('../data/lem_stem_4000_best_features.csv', index=False, header=True)

In [5]:
del lemmed_best_features
del vectored_best_features

In [8]:
from sklearn.ensemble import RandomForestClassifier

best_rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, n_jobs=-1)

best_rfecv = RFECV(estimator=best_rf_model, step=1000, cv=StratifiedKFold(2), verbose=2)

best_rfecv.fit(total_best_features, all_target)

Fitting estimator with 3931 features.
Fitting estimator with 2931 features.
Fitting estimator with 1931 features.
Fitting estimator with 931 features.
Fitting estimator with 3931 features.
Fitting estimator with 2931 features.
Fitting estimator with 1931 features.
Fitting estimator with 931 features.


In [9]:
best_best_features = best_rfecv.support_
best_best_features_df = total_best_features.loc[:, best_best_features]
best_best_features_list = best_best_features_df.columns.tolist()
print(len(best_best_features_list))
print(type(best_best_features_list))
print(best_best_features_list[:8])

with open('best_best_features.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(best_best_features_list)

3931
<class 'list'>
['ability', 'able', 'absolute', 'absolutely', 'absorb', 'abuse', 'ac', 'accept']


In [14]:
print(best_best_features_list == list(total_best_features.columns))

best_rfecv.score(total_best_features, all_target)

0.9629913038859463