In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import VotingClassifier

In [2]:
data = pd.read_csv('classification_data.csv')
data.rename(columns={'tweet_text_x': 'tweet_text'}, inplace=True)
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,informative
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,informative
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,informative
4,917792092100988929,RT @TIME: California's raging wildfires as you...,informative


In [3]:
data = data.dropna(axis=0)

In [4]:
data['text_info'] = data['text_info'].apply(lambda x: 1 if (x == "informative")  else 0)

In [5]:
data.head()

Unnamed: 0,tweet_id,tweet_text,text_info
0,917791130590183424,PHOTOS: Deadly wildfires rage in California ht...,1
1,917791044158185473,RT @Gizmodo: Wildfires raging through Northern...,1
2,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
3,917791291823591425,RT @Cal_OES: PLS SHARE: We’re capturing wildfi...,1
4,917792092100988929,RT @TIME: California's raging wildfires as you...,1


In [6]:
X_train, X_test, y_train, y_test = train_test_split(data['tweet_text'], data['text_info'], test_size=0.3)

In [7]:
vectorizer = CountVectorizer()

In [8]:
X = vectorizer.fit_transform(X_train).toarray()

In [9]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
estimators = []

In [11]:
estimators.append(('lgr', LogisticRegression()))

In [12]:
estimators.append(('rf', RandomForestClassifier(n_estimators=500,
                                 max_features=0.25,
                                 criterion="entropy",
                                 class_weight="balanced")))

In [13]:
estimators.append(('cart', DecisionTreeClassifier()))

In [14]:
estimators.append(('svc', SVC()))

In [15]:
ensemble = VotingClassifier(estimators, weights=[2, 1, 1, 2])
ensemble.fit(X, y_train)

In [16]:
pred = ensemble.predict(vectorizer.transform(X_test).toarray())
accuracy = metrics.accuracy_score(y_test, pred)
precisions, recall, f1_score, _ = metrics.precision_recall_fscore_support(y_test, pred)
   

In [17]:
accuracy

0.8050314465408805

In [18]:
precisions, recall

(array([0.64285714, 0.81514477]), array([0.17821782, 0.97340426]))

In [19]:
f1_score

array([0.27906977, 0.88727273])

In [20]:
clf = LogisticRegression()
# clf = RandomForestClassifier(n_estimators=500, max_features=0.25, criterion="entropy", class_weight="balanced")
# clf = DecisionTreeClassifier()
# clf = SVC()
clf.fit(X, y_train)

In [21]:
lgr_pred = clf.predict(vectorizer.transform(X_test).toarray())
metrics.accuracy_score(y_test, lgr_pred)

0.80083857442348