In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
def clean_txt(txt_in):
    import re
    clean_str = re.sub("[^A-Za-z]+", " ", txt_in).strip().lower()
    return clean_str
def my_stop_words(var_in):
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    tmp = [word for word in var_in.split() if word not in sw]
    tmp = ' '.join(tmp)
    return tmp
def my_stem(var_in):
    from nltk.stem.porter import PorterStemmer
    my_stem = PorterStemmer()
    tmp = [my_stem.stem(word) for word in var_in.split()]
    tmp = ' '.join(tmp)
    return tmp

def my_bow(df_in, gram_m, gram_n, sw):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd
    if sw == "tf-idf":
        my_cv = TfidfVectorizer(ngram_range=(gram_m, gram_n))
    else:
        my_cv = CountVectorizer(ngram_range=(gram_m, gram_n))
    my_cv_data = pd.DataFrame(my_cv.fit_transform(df_in).toarray())
    col_names = list(my_cv.vocabulary_.keys())
    my_cv_data.columns = col_names
    return my_cv_data,my_cv

def ml_helper(model,x,y,xt,yt,params,cv,sc):
    from sklearn.model_selection import GridSearchCV
    grid=GridSearchCV(model, param_grid=params, cv=cv, scoring=sc)
    grid.fit(x,y)
    print("best mean cv score {}".format(grid.best_score_))
    print("test score {}".format(grid.score(xt, yt)))
    print("best parameters {}".format(grid.best_params_))
    return None

def my_pca(df_in,exp_var_in):
    from sklearn.decomposition import PCA
    my_pca = PCA(n_components=exp_var_in)#, svd_solver='full')
    my_pca_data = my_pca.fit_transform(df_in)
    exp_var = sum(my_pca.explained_variance_ratio_)
    print ("Explained variance is:", exp_var)
    return my_pca_data, my_pca

def nlp_pipline(txt, vec, model, pca):
    cleaned_bd=clean_txt(txt)
    rem_sw=my_stop_words(cleaned_bd)
    stemmed=my_stem(rem_sw)
    vectorized=vec.transform([stemmed])
    reduced=pca.transform(vectorized.toarray())
    label=model.predict(reduced)
    
    if label[0]==0:
        lab="NEGATIVE"
    elif label[0]==1:
        lab="POSITIVE"
    else:
        lab="other"
    prob1=np.mean(model.predict_proba(reduced)[:,0])
    prob2=np.mean(model.predict_proba(reduced)[:,1])

    print("The class is",lab, "and the probability for negativity is",round(prob1,4), 
          "the probability for positivity is", round(prob2,4))

In [3]:
# change the path to yours
df=pd.read_csv(r"D:\CU\spring 2022\nlp\Roku\amazon_model_df.csv")

In [4]:
# 这里可以搞出了两个可以用在后面traning的dataset，一个是bow一个是tfidf，后面看那个效果好就用那个
# 另外，中间的1，3代表我们看1个单词或者3个单词组成的词组的情况，可以视情况而定要不要进行调整（1，3），（1，2）
bow_pack=my_bow(df["stemmed"], 1, 1, "bow")
tfidf_pack=my_bow(df["stemmed"], 1, 1, "tf-idf")
bow_df=bow_pack[0]
bow_cv=bow_pack[1]
tfidf_df=tfidf_pack[0]
tfidf_cv=tfidf_pack[1]


In [5]:
y=df["label"]

In [6]:
# 不用改什么
bow_pca_pack=my_pca(bow_df, 0.9)
tfidf_pca_pack=my_pca(tfidf_df, 0.9)
bow_pca=bow_pca_pack[0]
tfidf_pca=tfidf_pca_pack[0]
bow_pca_transformer=bow_pca_pack[1]
tfidf_pca_transformer=tfidf_pca_pack[1]

Explained variance is: 0.9000351048871951
Explained variance is: 0.9002337303352271


In [7]:
# you can the X= to bow_pca, tfidf_pca and run this every time you change the parameter above
X=bow_pca
X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=42)

In [20]:
# model training
parameters={"max_depth":np.arange(22,24,1), "min_samples_leaf":np.arange(4,5,1),"n_estimators":np.arange(88,92,1) }
# change the numbers in the bracket

# this will print out the f1 score and the best parameters
ml_helper(RandomForestClassifier(random_state=42),X_train,y_train,X_test,y_test,parameters,5,"f1")

best mean cv score 0.7512247949082683
test score 0.8134453781512605
best parameters {'max_depth': 23, 'min_samples_leaf': 4, 'n_estimators': 90}


In [21]:
# fitting the best model, put the result from above to the respective place

best_rf=RandomForestClassifier(max_depth=23,min_samples_leaf=4,n_estimators=91,random_state=90)
# for the line above, change the number， 注意顺序填对
best_rf.fit(X_train, y_train)
y_pred=best_rf.predict(X_test)
print(classification_report(y_test, y_pred)) # the table printed below gives the accuracy of the model 

              precision    recall  f1-score   support

           0       0.81      0.77      0.79       267
           1       0.80      0.83      0.81       290

    accuracy                           0.80       557
   macro avg       0.80      0.80      0.80       557
weighted avg       0.80      0.80      0.80       557



In [25]:
txt_in="best device ever. stable connection and diverse channels. super good."
vec_in=bow_cv # 如果用的bow就是bow_cv, 如果是tfidf就是tfidf_cv
pca_in=bow_pca_transformer # 如果用的bow就是bow_pca_transformer, 如果是tfidf就是tfidf_pca_transformer

nlp_pipline(txt_in, vec_in, best_rf, pca_in)

The class is POSITIVE and the probability for negativity is 0.4889 the probability for positivity is 0.5111
