## use nlp for text analysis

In [2]:
import pandas as pd
#load data
data = pd.read_csv("data_trian.csv")
data.head()
#remove special characters
data['text']=data['text'].str.replace("[^a-zA-Z#]", " ")

In [3]:
import pandas as pd
import numpy as np
from sklearn import model_selection
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['text'], data['class'],test_size = 0.2, random_state = 1)

In [4]:
print(train_x.shape, test_x.shape)
train_x.head()
test_x.head()

(9436,) (2360,)


8355     tourist trap  can t believe so many people fal...
5293     This was my first time there and I was not dis...
988      Frankly  I am torn between if I should give th...
4669     Yelp needs to update the address of this place...
11464    First time eating French food  Wow amazing hol...
Name: text, dtype: object

In [5]:
'''
 use TfidfVectorizer() and CountVectorizer() for feather extraction, and apply the feathers into different models
'''
#Use TF-IDF to extract features and vectorize sentences after word segmentation.
#import nltk package
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
TF_Vec=TfidfVectorizer(max_df=0.8,
                       min_df = 3,
                       stop_words=frozenset(stop_words)
                      )
#data fitting, transform the data into standard form (usually used in traning set)
train_x_tfvec=TF_Vec.fit_transform(train_x)
#Standardization through centralization and scaling (used in test sets)
test_x_tfvec=TF_Vec.transform(test_x)
 
#Start using CountVectorizer() for feature extraction. It transforms vectors according to the frequency of words. 
CT_Vec=CountVectorizer(max_df=0.8,#remove the words appear more than 80%
                       min_df = 3,#remove the words appear less than 3 times
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',#Use regular expressions 
                       stop_words=frozenset(stop_words))#add stop words
#data fitting, transform the data into standard form (usually used in traning set)
train_x_ctvec=CT_Vec.fit_transform(train_x)
#Standardization through centralization and scaling (used in test sets)
test_x_ctvec=CT_Vec.transform(test_x)



[nltk_data] Downloading package stopwords to /Users/karen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
'''
using TF_IDF to extracting data features 
'''
#import package
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import time
start_time=time.time()
#create model
lr = linear_model.LogisticRegression(penalty='l2', C=1, solver='liblinear', max_iter=1000, multi_class='ovr')
#Optimize the model, because some parameters are uncertain, so let the model determine its own parameters during training 
# The name of the model is also changed from LR to model
model = GridSearchCV(lr, cv=3, param_grid={
        'C': np.logspace(0, 4, 30), # logspace(a,b,n) generates n points between decades 10^a and 10^b.
        'penalty': ['l1', 'l2']
    })
#模型拟合tf-idf拿到的数据
model.fit(train_x_tfvec,train_y)
#find optimal parameter
print('optimal parameter：', model.best_params_)
#accuracy rate on train set before training
pre_train_y=model.predict(train_x_tfvec)
#accuracy score on train set
train_accracy=accuracy_score(pre_train_y,train_y)
#View predictions at the end of training 
pre_test_y=model.predict(test_x_tfvec)
#accuracy score on test set
test_accracy = accuracy_score(pre_test_y,test_y)
print('Use TF-IDF for feature extraction; use logistic regression to find the optimal model\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("the run time of optimized model",end_time-start_time)

optimal parameter： {'C': 1.0, 'penalty': 'l2'}
Use TF-IDF for feature extraction; use logistic regression to find the optimal model
training set:0.7928147520135651
test set:0.6826271186440678
the run time of optimized model 51.10105776786804


In [8]:
'''
use CountVectorizer to extracting data features 
'''
#import package
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import time
start_time=time.time()
#create model
lr = linear_model.LogisticRegression(penalty='l2', C=1, solver='liblinear', max_iter=1000, multi_class='ovr')

model = GridSearchCV(lr, cv=3, param_grid={
        'C': np.logspace(0, 4, 30),
        'penalty': ['l1', 'l2']
    })
#Model fitting using CountVectorizer
model.fit(train_x_ctvec,train_y)
#find optimal parameter
print('optimal parameter：', model.best_params_)
#check accuracy rate 
pre_train_y=model.predict(train_x_ctvec)

train_accracy=accuracy_score(pre_train_y,train_y)

pre_test_y=model.predict(test_x_ctvec)

test_accracy = accuracy_score(pre_test_y,test_y)
print('Use CountVectorizer for feather extraction, use logistic regression to find the optimal model\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("The runtime of optimized model",end_time-start_time)



optimal parameter： {'C': 1.0, 'penalty': 'l1'}
Use CountVectorizer for feather extraction, use logistic regression to find the optimal model
training set:0.869118270453582
test set:0.6538135593220339
The runtime of optimized model 59.386225938797


### use other machine learning models

##### 1. KNN

In [9]:
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
start_time=time.time()
#CREATE model
Kn = KNeighborsClassifier()
#fit model using the data from tf-idf
Kn.fit(train_x_tfvec,train_y)
pre_train_y=Kn.predict(train_x_tfvec)
train_accracy=accuracy_score(pre_train_y,train_y)
pre_test_y=Kn.predict(test_x_tfvec)
test_accracy = accuracy_score(pre_test_y,test_y)
print('Use TfidfVectorizer for feature extraction, use KNN classifier, the accuracy rate is:\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("The runtime of KNN classifier is",end_time-start_time)



Use TfidfVectorizer for feature extraction, use KNN classifier, the accuracy rate is:
training set:0.511551504874947
test set:0.4775423728813559
The runtime of KNN classifier is 7.416763782501221


##### 1.2 Random Forest Classifier

In [10]:
### Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier 
import time
start_time=time.time()
#create model
Rfc = RandomForestClassifier(n_estimators=100)
#fit the model using data from CounterfVectorizer
Rfc.fit(train_x_ctvec,train_y)
pre_train_y=Rfc.predict(train_x_ctvec)

train_accracy=accuracy_score(pre_train_y,train_y)

pre_test_y=Rfc.predict(test_x_ctvec)

test_accracy = accuracy_score(pre_test_y,test_y)
print('use CounterfVectorizer for feature extraction; use Random Forest Classifier; the accuracy rate is\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("The runtime of Random Forest Classififer is",end_time-start_time)

use CounterfVectorizer for feature extraction; use Random Forest Classifier; the accuracy rate is
training set:0.9992581602373887
test set:0.6813559322033899
The runtime of Random Forest Classififer is 10.78722596168518


##### 1.3 Decision Tree Classifier

In [11]:
#Decision Tree Classifier
from sklearn import tree
import time
start_time=time.time()
#create model
Rf = tree.DecisionTreeClassifier()
#fit using data from tf-idf
Rf.fit(train_x_tfvec,train_y)
pre_train_y=Rf.predict(train_x_tfvec)
train_accracy=accuracy_score(pre_train_y,train_y)
test_accracy = accuracy_score(pre_test_y,test_y)
print('use tffor feature extraction, use decision tree classifier, the accuracy is:\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("the runtime of decision tree classifier is:",end_time-start_time)

use tffor feature extraction, use decision tree classifier, the accuracy is:
training set:0.9992581602373887
test set:0.6813559322033899
the runtime of decision tree classifier is: 2.791985034942627


#### 1.4 Bayes model

In [12]:
from sklearn.naive_bayes import MultinomialNB
import time
start_time=time.time()
#create model
Bys = MultinomialNB()
Bys.fit(train_x_ctvec, train_y)
pre_train_y=Bys.predict(train_x_ctvec)
train_accracy=accuracy_score(pre_train_y,train_y)
pre_test_y=Bys.predict(test_x_ctvec)
test_accracy = accuracy_score(pre_test_y,test_y)
print('use CounterVectorizer for feature extraction, use Bayes Classifier, the accuracy rate is:\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("The runtime of Bayes Classifier is",end_time-start_time)

use CounterVectorizer for feature extraction, use Bayes Classifier, the accuracy rate is:
training set:0.7658965663416702
test set:0.6775423728813559
The runtime of Bayes Classifier is 0.05059003829956055


#### 1.5 SVM

In [13]:
from sklearn.svm import SVC
import time
start_time=time.time()
SVM = SVC(C=1.0, kernel='rbf', gamma='auto')
SVM.fit(train_x_ctvec, train_y)
pre_train_y=SVM.predict(train_x_ctvec)
train_accracy=accuracy_score(pre_train_y,train_y)
pre_test_y=SVM.predict(test_x_ctvec)
test_accracy = accuracy_score(pre_test_y,test_y)
print('Use CounterfVectorizer for feature extraction, use SVM Classifier, the accuracy rate is:\ntraining set:{0}\ntest set:{1}'.format(train_accracy,test_accracy))
end_time=time.time()
print("The runtime of SVM Classifier is:",end_time-start_time)

Use CounterfVectorizer for feature extraction, use SVM Classifier, the accuracy rate is:
training set:0.6409495548961425
test set:0.6161016949152542
The runtime of SVM Classifier is: 68.90573620796204


### Conclusion

The above results show that Decision Tree Classifier has the best performance over all models. The decision tree classifier has comparatively less runtime, but higher accuracy score on the test set.