In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

In [2]:
ds_demo = pd.read_csv("twitter_income.csv")
ds_demo.head(5)

Unnamed: 0,user_id,age,gender:female_gt_0_5,anxious:agree,anxious:strongly_agree,anxious:disagree,anxious:strongly_disagree,anxious:neither,children:no,children:yes,...,Topic_191,Topic_192,Topic_193,Topic_194,Topic_195,Topic_196,Topic_197,Topic_198,Topic_199,Topic_200
0,1103,30.700773,-0.050586,0.113558,0.034757,0.329889,0.22123,0.300567,0.682671,0.317329,...,0.143927,0.00107,0.00107,0.006956,0.042804,0.036918,0.019797,0.0,0.093098,0.010166
1,8273,29.817867,0.376868,0.079798,0.030549,0.360824,0.285467,0.243363,0.69313,0.30687,...,0.164856,0.0,0.001863,0.004967,0.05371,0.009624,0.005278,0.000621,0.066749,0.007762
2,9269,31.260406,0.204542,0.079508,0.026907,0.343969,0.273264,0.276351,0.835034,0.164966,...,0.136182,0.0,0.00123,0.005841,0.014448,0.071011,0.004919,0.010452,0.043345,0.0083
3,10571,33.120257,0.164126,0.093635,0.03756,0.410242,0.185529,0.273034,0.837154,0.162846,...,0.128398,0.0,0.002313,0.018508,0.020821,0.02487,0.010989,0.002313,0.07461,0.015616
4,11758,26.284088,-0.066739,0.103949,0.038573,0.239573,0.237358,0.380548,0.741889,0.258111,...,0.085206,0.000468,0.001404,0.004682,0.010768,0.020599,0.004682,0.001404,0.042603,0.015918


In [3]:
ds_demo.columns.get_loc("mean_income")

84

### Decision Tree Feature Selection(Top 20)

In [4]:
from sklearn.model_selection import train_test_split
y = pd.cut(ds_demo.loc[:,"mean_income"], bins = 2, labels = list(range(0,2)))
X = ds_demo.iloc[:,np.r_[1:84, 85:284]]
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=999)

In [5]:
X_train.head()

Unnamed: 0,age,gender:female_gt_0_5,anxious:agree,anxious:strongly_agree,anxious:disagree,anxious:strongly_disagree,anxious:neither,children:no,children:yes,education:degree,...,Topic_190,Topic_191,Topic_192,Topic_193,Topic_194,Topic_195,Topic_196,Topic_197,Topic_198,Topic_199
1550,34.224636,0.415983,0.079684,0.029174,0.307664,0.268543,0.314935,0.711415,0.288585,0.439273,...,0.023898,0.209808,0.000621,0.004035,0.019553,0.041899,0.002793,0.011484,0.000931,0.115456
327,26.626526,0.078611,0.106335,0.040307,0.415485,0.19448,0.243393,0.76666,0.23334,0.328166,...,0.021853,0.124466,0.001425,0.001425,0.005701,0.026128,0.010451,0.006651,0.0019,0.066508
4051,32.860484,0.813067,0.108011,0.031629,0.404499,0.222797,0.233064,0.788085,0.211915,0.323776,...,0.011811,0.094488,0.0,0.0,0.015748,0.027559,0.003937,0.0,0.003937,0.043307
4800,43.458246,0.312272,0.069568,0.022798,0.43133,0.21358,0.262723,0.766906,0.233094,0.352064,...,0.036689,0.080205,0.0,0.001706,0.127133,0.003413,0.023891,0.021331,0.005119,0.08959
4006,27.716917,0.781153,0.093069,0.033794,0.354316,0.260325,0.258497,0.750471,0.249529,0.416749,...,0.006714,0.140985,0.002238,0.004156,0.01055,0.028453,0.002558,0.006394,0.001279,0.070972


In [6]:
from sklearn.ensemble import RandomForestClassifier
# learn model
dt = RandomForestClassifier(max_depth = 8)
dt.fit(X_train,y_train)

# in sample accuracy
print ('In sample accuracy:',dt.score(X_train,y_train))

# out of sample accuracy
print ('Out of sample accuracy:',dt.score(X_test,y_test))

In sample accuracy: 0.9674855491329479
Out of sample accuracy: 0.9422521655437921


In [31]:
# This time we'll use max_depth to control the complexity of the tree, still using the same train/test split as above,
# and optimize the parameter value using GridSearchCV.
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':range(1,10)}
dt = RandomForestRegressor()
gr = GridSearchCV(dt,param_grid=param_grid)
rs=gr.fit(X_train,y_train)
print (rs.best_params_)
# print (roc_auc_score(np.array(y_test),rs.predict_proba(X_test)[:,1]))

{'max_depth': 7}


### Feature Selection


In [32]:
dt = RandomForestClassifier(max_depth = 7)
dt.fit(X_train, y_train)
Feature_importance=pd.DataFrame([list(X_train.columns),list(dt.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]

# list the top 5 most important features in order
df_select = Feature_importance.sort_values(by="importance",ascending=False).iloc[:20,:]

In [33]:
df_select.head(20)

Unnamed: 0,variables,importance
14,Topic_95,0.109644
3,Topic_139,0.102721
9,age,0.0756366
2,Topic_162,0.0716667
10,Topic_158,0.0675852
0,Topic_124,0.0539991
8,Topic_32,0.0513896
18,urls,0.0471569
12,Topic_56,0.046097
1,Topic_19,0.045892


In [34]:
top_20 = list(df_select.variables)

In [35]:
RF = top_20

In [36]:
df_RF = ds_demo[RF]
df_RF.head()

Unnamed: 0,Topic_95,Topic_139,age,Topic_162,Topic_158,Topic_124,Topic_32,urls,Topic_56,Topic_19,Topic_180,Topic_40,Topic_64,Topic_18,Topic_24,Topic_1,followers,retweeted,Topic_75,political:unafiliated
0,0.025682,0.023007,30.700773,0.079187,0.113965,0.031033,0.017657,0.333868,0.037453,0.010166,0.018727,0.038523,0.017657,0.002675,0.039593,0.000535,2358.0,0.229535,0.001605,0.44422
1,0.008072,0.007451,29.817867,0.069854,0.015523,0.005899,0.002794,0.286867,0.048432,0.005899,0.002484,0.017075,0.004657,0.009935,0.010556,0.000621,3761.0,0.191866,0.002173,0.418509
2,0.013833,0.00953,31.260406,0.061789,0.055641,0.025208,0.011067,0.755303,0.069167,0.005533,0.007685,0.043959,0.005226,0.0,0.031048,0.031048,11089.0,0.372579,0.01537,0.470566
3,0.00694,0.038751,33.120257,0.037016,0.018508,0.00694,0.008097,0.249277,0.027183,0.013302,0.012146,0.042799,0.006362,0.002892,0.008676,0.000578,563.0,0.041064,0.009832,0.443951
4,0.005618,0.007959,26.284088,0.032303,0.016386,0.009831,0.004682,0.536517,0.022004,0.007959,0.007022,0.016854,0.006554,0.001404,0.018258,0.001404,277.0,0.043071,0.002809,0.374156


In [37]:
Y = pd.cut(ds_demo.loc[:,"mean_income"], bins = 10, labels = list(range(0,10)))
Y.head()

0    3
1    3
2    3
3    0
4    2
Name: mean_income, dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]

### Modeling 

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    df_RF, Y, test_size=0.1, random_state=0)

In [39]:
dt = RandomForestClassifier(max_depth = 5)
dt.fit(X_train, y_train)

pred = dt.predict(X_test)
print(dt.score)
# Feature_importance=pd.DataFrame([list(X_train.columns),list(dt.feature_importances_)]).T
# Feature_importance.columns=["variables","importance"]

# # list the top 5 most important features in order
# df_select = Feature_importance.sort_values(by="importance",ascending=False).iloc[:20,:]

<bound method ClassifierMixin.score of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>


In [40]:
pred

array([1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 3, 1, 1, 3,
       1, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       1, 0, 2, 1, 1, 1, 2, 1, 2, 1, 3, 1, 2, 1, 1, 2, 1, 9, 1, 1, 1, 2,
       1, 1, 9, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 3, 1,
       1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 1,
       2, 1, 1, 9, 1, 2, 2, 1, 1, 1, 2, 0, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2,
       1, 1, 1, 1, 3, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 3,
       1, 2, 1, 3, 1, 1, 1, 1, 1, 9, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2,
       1, 1, 1, 1, 2, 1, 1, 1, 2, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 2, 2, 3, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1,
       1, 2, 1, 1, 9, 2, 1, 2, 1, 1, 2, 1, 2, 1, 1,

In [41]:
dt.score(X_test,y_test)

0.4423076923076923

In [42]:
RF

['Topic_95',
 'Topic_139',
 'age',
 'Topic_162',
 'Topic_158',
 'Topic_124',
 'Topic_32',
 'urls',
 'Topic_56',
 'Topic_19',
 'Topic_180',
 'Topic_40',
 'Topic_64',
 'Topic_18',
 'Topic_24',
 'Topic_1',
 'followers',
 'retweeted',
 'Topic_75',
 'political:unafiliated']

In [43]:
from sklearn.model_selection import cross_val_score, cross_validate

In [44]:
scores = cross_val_score(dt, df_RF, Y, cv = 10)

In [45]:
np.mean(scores)

0.42324633560504965