In [11]:
from nltk.corpus import stopwords
from collections import defaultdict
import re
import pandas as pd
import numpy as np
from gensim import corpora
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# English stop words; preprocess() drops these tokens from every document.
stoplist = stopwords.words('english')

# Set path and read data.
# Each file is read into a list with one entry per line (presumably one review
# per line -- confirm against the data files). The `with` blocks guarantee the
# file handles are closed as soon as the lines are in memory.
base_dir = 'IMDB/train/'
with open(base_dir + 'imdb_train_pos.txt') as pos_file:
    pos_texts = pos_file.readlines()
with open(base_dir + 'imdb_train_neg.txt') as neg_file:
    neg_texts = neg_file.readlines()

#Data preprocessing functions
def preprocess(texts, freq_threshold=10, stop_words=None):
    """Tokenize raw documents, drop stop words and filter out rare tokens.

    Every document has punctuation replaced by spaces, is lower-cased and is
    split on whitespace. Stop words are removed, then every token whose total
    count over *all* documents passed in this call is not strictly greater
    than ``freq_threshold`` is discarded.

    Args:
        texts: Iterable of raw document strings.
        freq_threshold: A token is kept only if it occurs strictly more than
            this many times across the whole batch. Defaults to 10, the value
            that used to be hard-coded.
        stop_words: Optional iterable of (lower-case) words to drop. When
            omitted, the module-level ``stoplist`` is used.

    Returns:
        A list with one list of tokens per input document, in input order.
        A document may end up as an empty list.
    """
    if stop_words is None:
        stop_words = stoplist
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stop_words)

    # Raw string so the regex escapes are not interpreted (and warned about) as
    # string escapes. The pattern itself is unchanged: one or more ASCII
    # punctuation/whitespace characters, or a single full-width punctuation mark.
    # NOTE(review): the original literal had `""` inside the second character
    # class, which Python parsed as implicit string concatenation, so no quote
    # character is part of that class. Possibly curly quotes were intended.
    punctuation = re.compile(
        r'[\s+\.\!\/_,$%^*(+"\?:)<>]+|[+——！，。？、~@#￥%……&*（）]'
    )

    tokenized = [
        [word for word in punctuation.sub(" ", document).lower().split()
         if word not in stop_words]
        for document in texts
    ]

    # Corpus-wide token counts over the documents of this call only.
    frequency = defaultdict(int)
    for tokens in tokenized:
        for token in tokens:
            frequency[token] += 1

    return [
        [token for token in tokens if frequency[token] > freq_threshold]
        for tokens in tokenized
    ]

In [12]:
# Tokenize and filter the raw training documents.
# NOTE: this rebinds pos_texts / neg_texts from raw strings to token lists, so
# this cell must run exactly once after the cell that loads the files.
pos_texts = preprocess(pos_texts)
neg_texts = preprocess(neg_texts)

# After dividing the documents into words, use corpora.Dictionary to generate
# one dictionary per class.
pos_dictionary = corpora.Dictionary(pos_texts)
neg_dictionary = corpora.Dictionary(neg_texts)

# Record the keys and values of each dictionary's `cfs` mapping as two named
# columns (presumably token id -> occurrence count; see the gensim docs).
# Building the frame from the (key, value) pairs directly avoids the
# transposed list-of-dict-views construction and its reliance on dtype inference.
df_pos = pd.DataFrame(list(pos_dictionary.cfs.items()), columns=['pos_key', 'pos_freq'])
df_neg = pd.DataFrame(list(neg_dictionary.cfs.items()), columns=['neg_key', 'neg_freq'])

# Look each key up in its own dictionary to recover the word it stands for.
df_pos['word'] = df_pos['pos_key'].apply(lambda token_id: pos_dictionary[token_id])
df_neg['word'] = df_neg['neg_key'].apply(lambda token_id: neg_dictionary[token_id])

# Connect the two tables on the word itself; the inner join keeps only words
# that occur in both classes, so the ratio below never divides by a missing count.
df = pd.merge(df_pos, df_neg, on='word', how='inner')
df['dif'] = df['pos_freq'] / df['neg_freq']

# Highest positive/negative ratio first; score is the natural log of the ratio
# (positive when the word is more frequent in the positive documents).
df = df.sort_values(by=['dif'], ascending=False).reset_index(drop=True)
df['score'] = np.log(df['dif'])

In [13]:
# Lookup tables for feature construction, derived from the training frame `df` only.
# `df` is expected to be ordered by descending pos/neg ratio, so the head holds the
# most "positive" words and the tail the most "negative" ones.
N = 100
vocab = df['word']
midpoint = len(df) / 2

pos_words = list(vocab[:N])                 # first N rows
neg_words = list(vocab[len(df) - N:])       # last N rows
mid_words = list(vocab[int(midpoint - (N / 2)):int(midpoint + (N / 2))])  # N rows around the middle
all_words = list(vocab)

# word -> log-ratio score for every word in the table
word_score = dict(zip(all_words, list(df['score'])))

In [14]:
# Per-document features (each word counted once per document):
#   Feature 1 (scores):     average score of the document's words found in all_words
#                           (0 when none is found)
#   Feature 2 (pos_counts): number of distinct words from pos_words (top N)
#   Feature 3 (neg_counts): number of distinct words from neg_words (last N)
# y holds the label: 1 for documents from pos_texts, 0 for neg_texts.

# Build the vocabulary sets once instead of once per document.
all_word_set = set(all_words)
pos_word_set = set(pos_words)
neg_word_set = set(neg_words)

scores = []
pos_counts = []
neg_counts = []
y = []

# Positive documents first, then negative ones, sharing a single loop body.
for texts, label in ((pos_texts, 1), (neg_texts, 0)):
    for text in texts:
        tokens = set(text)
        inter_words = tokens & all_word_set
        if inter_words:
            score = np.mean([word_score[word] for word in inter_words])
        else:
            score = 0
        scores.append(score)
        pos_counts.append(len(tokens & pos_word_set))
        neg_counts.append(len(tokens & neg_word_set))
        y.append(label)

In [15]:
# Assemble the training table column by column. dtype=float keeps every column
# (including the label) as float, matching the 0.0 / 1.0 labels shown in the reports.
train_data = pd.DataFrame(
    {'score': scores, 'pos_count': pos_counts, 'neg_count': neg_counts, 'label': y},
    dtype=float,
)

# Shuffle the rows; a fixed seed makes the row order reproducible across reruns.
train_data = train_data.sample(frac=1, random_state=42)

X_train = train_data.drop(['label'], axis=1)
y_train = train_data['label']

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
# Use the Gradient Boosting Decision Tree (GBDT) algorithm in sklearn to train the model.
# random_state pins the estimator's internal randomness so reruns fit the same model.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# This report is computed on the training data itself, so it measures fit,
# not generalization.
print(classification_report(y_train, model.predict(X_train)))

              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88      7517
         1.0       0.87      0.89      0.88      7483

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [17]:
# Use the same method to process the test data before feeding it to the trained model.
# NOTE: base_dir, pos_texts and neg_texts are rebound here, replacing the training values.
base_dir = 'IMDB/test/'
with open(base_dir + 'imdb_test_pos.txt') as pos_file:
    pos_texts = pos_file.readlines()
with open(base_dir + 'imdb_test_neg.txt') as neg_file:
    neg_texts = neg_file.readlines()

# preprocess() counts token frequencies over the documents it is given, so the
# rare-token filter here is based on the test files, not the training files.
pos_texts = preprocess(pos_texts)
neg_texts = preprocess(neg_texts)

In [18]:
# Build the same three features for the test documents, using the word lists and
# scores that are already in scope (all_words, pos_words, neg_words, word_score).

# Build the vocabulary sets once instead of once per document.
all_word_set = set(all_words)
pos_word_set = set(pos_words)
neg_word_set = set(neg_words)

scores = []
pos_counts = []
neg_counts = []
y = []

# Positive documents (label 1) first, then negative ones (label 0).
for texts, label in ((pos_texts, 1), (neg_texts, 0)):
    for text in texts:
        tokens = set(text)
        inter_words = tokens & all_word_set
        if inter_words:
            score = np.mean([word_score[word] for word in inter_words])
        else:
            # No scored word in this document.
            score = 0
        scores.append(score)
        pos_counts.append(len(tokens & pos_word_set))
        neg_counts.append(len(tokens & neg_word_set))
        y.append(label)

# dtype=float keeps every column (including the label) as float, matching the
# 0.0 / 1.0 labels shown in the reports.
test_data = pd.DataFrame(
    {'score': scores, 'pos_count': pos_counts, 'neg_count': neg_counts, 'label': y},
    dtype=float,
)

# Shuffle the rows with a fixed seed so the order is reproducible across reruns.
test_data = test_data.sample(frac=1, random_state=42)
X_test = test_data.drop(['label'], axis=1)
y_test = test_data['label']

In [28]:
# Score the test features with the fitted model and print the per-class metrics.
test_predictions = model.predict(X_test)
test_report = classification_report(y_test, test_predictions)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89      2501
         1.0       0.88      0.90      0.89      2499

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



[nltk_data] Downloading package stopwords to /Users/craig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True