In [1]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import json
%matplotlib inline
import copy
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 300

In [2]:
import pandas as pd

#col_names = ['url','urlid','boilerplate','alchemy_category','alchemy_category_score','avglinksize','commonlinkratio_1','commonlinkratio_2','commonlinkratio_3','commonlinkratio_4','compression_ratio','embed_ratio','framebased','frameTagRatio','hasDomainLink','html_ratio','image_ratio','is_news','lengthyLinkDomain','linkwordscore','news_front_page','non_markup_alphanum_characters','numberOfLinks','numwords_in_url','parametrizedLinkRatio','spelling_errors_ratio','label']
train_data = pd.read_csv('train.tsv', sep='\t')

In [3]:
train_data.shape

(7395, 27)

## Predicting "Greenness" Of Content

This dataset comes from [stumbleupon](https://www.stumbleupon.com/), a web page recommender and was made available [here](https://www.kaggle.com/c/stumbleupon/download/train.tsv)

A description of the columns is below

FieldName|Type|Description
---------|----|-----------
url|string|Url of the webpage to be classified
urlid|integer| StumbleUpon's unique identifier for each url
boilerplate|json|Boilerplate text
alchemy_category|string|Alchemy category (per the publicly available Alchemy API found at www.alchemyapi.com)
alchemy_category_score|double|Alchemy category score (per the publicly available Alchemy API found at www.alchemyapi.com)
avglinksize| double|Average number of words in each link
commonLinkRatio_1|double|# of links sharing at least 1 word with 1 other links / # of links
commonLinkRatio_2|double|# of links sharing at least 1 word with 2 other links / # of links
commonLinkRatio_3|double|# of links sharing at least 1 word with 3 other links / # of links
commonLinkRatio_4|double|# of links sharing at least 1 word with 4 other links / # of links
compression_ratio|double|Compression achieved on this page via gzip (measure of redundancy)
embed_ratio|double|Count of number of <embed> usage
frameBased|integer (0 or 1)|A page is frame-based (1) if it has no body markup but have a frameset markup
frameTagRatio|double|Ratio of iframe markups over total number of markups
hasDomainLink|integer (0 or 1)|True (1) if it contains an <a> with an url with domain
html_ratio|double|Ratio of tags vs text in the page
image_ratio|double|Ratio of <img> tags vs text in the page
is_news|integer (0 or 1) | True (1) if StumbleUpon's news classifier determines that this webpage is news
lengthyLinkDomain| integer (0 or 1)|True (1) if at least 3 <a> 's text contains more than 30 alphanumeric characters
linkwordscore|double|Percentage of words on the page that are in hyperlink's text
news_front_page| integer (0 or 1)|True (1) if StumbleUpon's news classifier determines that this webpage is front-page news
non_markup_alphanum_characters|integer| Page's text's number of alphanumeric characters
numberOfLinks|integer Number of <a>|markups
numwords_in_url| double|Number of words in url
parametrizedLinkRatio|double|A link is parametrized if it's url contains parameters or has an attached onClick event
spelling_errors_ratio|double|Ratio of words not found in wiki (considered to be a spelling mistake)
label|integer (0 or 1)|User-determined label. Either evergreen (1) or non-evergreen (0); available for train.tsv only

### What are 'evergreen' sites?
- These are websites that always relevant like recipes or reviews (as opposed to current events)
- Look at some examples

In [4]:
data = pd.read_csv('train.tsv', sep='\t', na_values={'is_news' : '?'}).fillna(0)

# Extract the title and body from the boilerplate JSON text
data['title'] = data.boilerplate.map(lambda x: json.loads(x).get('title', ''))
data['body'] = data.boilerplate.map(lambda x: json.loads(x).get('body', ''))

In [5]:
data.shape

(7395, 29)

In [7]:
data[['title', 'label']].head()

Unnamed: 0,title,label
0,"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries",0
1,"The Fully Electronic Futuristic Starting Gun That Eliminates Advantages in Races the fully electronic, futuristic starting gun that eliminates advantages in races the fully electronic, futuristic starting gun that eliminates advantages in races",1
2,Fruits that Fight the Flu fruits that fight the flu | cold & flu | men's health,1
3,10 Foolproof Tips for Better Sleep,1
4,The 50 Coolest Jerseys You Didn t Know Existed coolest jerseys you haven't seen,0


In [8]:
org_data = copy.deepcopy(data)

#### Does being a news site effect green-ness?

In [104]:
import statsmodels.formula.api as sm
model = sm.logit(
    "label ~is_news",
    data = data
).fit()

Optimization terminated successfully.
         Current function value: 0.692751
         Iterations 3


In [106]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,7395.0
Model:,Logit,Df Residuals:,7393.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,5.98e-05
Time:,17:03:11,Log-Likelihood:,-5122.9
converged:,True,LL-Null:,-5123.2
,,LLR p-value:,0.4337

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.0303,0.038,0.806,0.420,-0.043 0.104
is_news,0.0374,0.048,0.783,0.434,-0.056 0.131


#### Does the website category effect green-ness?

In [121]:
data = org_data

In [122]:
data['alchemy_category_score']
new_data  = data.loc[data['alchemy_category_score'] != '?']

In [123]:
new_data.shape

(5053, 29)

In [124]:
import statsmodels.formula.api as sm
model = sm.logit(
    "label ~ alchemy_category",
    data = new_data
).fit()

         Current function value: 0.629204
         Iterations: 35




In [125]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,5053.0
Model:,Logit,Df Residuals:,5040.0
Method:,MLE,Df Model:,12.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,0.09135
Time:,17:11:25,Log-Likelihood:,-3179.4
converged:,False,LL-Null:,-3499.0
,,LLR p-value:,4.273e-129

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,-0.5239,0.067,-7.767,0.000,-0.656 -0.392
alchemy_category[T.business],1.4259,0.100,14.200,0.000,1.229 1.623
alchemy_category[T.computer_internet],-0.5928,0.151,-3.932,0.000,-0.888 -0.297
alchemy_category[T.culture_politics],0.3544,0.128,2.776,0.006,0.104 0.605
alchemy_category[T.gaming],-0.0151,0.247,-0.061,0.951,-0.500 0.469
alchemy_category[T.health],0.8185,0.112,7.284,0.000,0.598 1.039
alchemy_category[T.law_crime],0.1985,0.370,0.536,0.592,-0.527 0.924
alchemy_category[T.recreation],1.2975,0.091,14.228,0.000,1.119 1.476
alchemy_category[T.religion],0.1874,0.248,0.755,0.451,-0.299 0.674


#### Does the image ratio effect green-ness?

In [128]:
data = org_data

In [129]:
data['image_ratio']
new_data  = data.loc[data['image_ratio'] > 0]

0       0.003883
1       0.088652
2       0.120536
3       0.035343
4       0.050473
5       0.038636
6       0.311377
7       0.025830
8       1.136646
9       0.206262
10      0.511364
11      0.060976
12      0.084112
13     -1.000000
14      0.036424
15      0.075630
16      0.276316
17      0.275862
18      0.091549
19      0.053254
20      0.693182
21      0.863636
22      0.062963
23      0.008274
24     -1.000000
25      0.085106
26      0.004608
27      0.004007
28      0.166667
29      1.184874
          ...   
7365    0.352941
7366    0.007758
7367   -1.000000
7368    0.126582
7369    0.031048
7370    0.313725
7371    0.129808
7372    0.172535
7373    0.006561
7374    0.363636
7375    0.043829
7376    0.026966
7377    0.112288
7378    0.278689
7379    0.067227
7380   -1.000000
7381    0.461538
7382    0.023490
7383   -1.000000
7384    0.165577
7385    3.083333
7386    0.003122
7387    0.137500
7388    0.274194
7389    1.666667
7390    0.048780
7391    0.225962
7392    0.4642

In [119]:
import statsmodels.formula.api as sm
model = sm.logit(
    "label ~ alchemy_category",
    data = new_data
).fit()

Optimization terminated successfully.
         Current function value: 0.643885
         Iterations 31


In [120]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,6665.0
Model:,Logit,Df Residuals:,6651.0
Method:,MLE,Df Model:,13.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,0.06968
Time:,17:10:43,Log-Likelihood:,-4291.5
converged:,True,LL-Null:,-4612.9
,,LLR p-value:,5.458e-129

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.0476,0.046,1.035,0.301,-0.043 0.138
alchemy_category[T.arts_entertainment],-0.5747,0.083,-6.900,0.000,-0.738 -0.411
alchemy_category[T.business],0.8964,0.090,10.005,0.000,0.721 1.072
alchemy_category[T.computer_internet],-1.1510,0.145,-7.916,0.000,-1.436 -0.866
alchemy_category[T.culture_politics],-0.1523,0.120,-1.267,0.205,-0.388 0.083
alchemy_category[T.gaming],-0.5831,0.261,-2.233,0.026,-1.095 -0.071
alchemy_category[T.health],0.2704,0.103,2.623,0.009,0.068 0.472
alchemy_category[T.law_crime],-0.5176,0.406,-1.276,0.202,-1.313 0.278
alchemy_category[T.recreation],0.8120,0.079,10.278,0.000,0.657 0.967


#### Fit a logistic regression model using statsmodels
- Test different features that may be valuable
- Examine the coefficients, does the feature increase or decrease the effect of being evergreen?

In [143]:
# TODO
data = org_data

In [144]:
data['avglinksize']
new_data  = data.loc[data['avglinksize'] > 0]

In [145]:
import statsmodels.formula.api as sm
model = sm.logit(
    "label ~ avglinksize",
    data = new_data
).fit()

Optimization terminated successfully.
         Current function value: 0.692738
         Iterations 4


In [146]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,7354.0
Model:,Logit,Df Residuals:,7352.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,2.403e-05
Time:,17:24:04,Log-Likelihood:,-5094.4
converged:,True,LL-Null:,-5094.5
,,LLR p-value:,0.6208

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.0523,0.025,2.131,0.033,0.004 0.100
avglinksize,0.0013,0.003,0.491,0.623,-0.004 0.007


In [147]:
data = org_data

In [148]:
data['spelling_errors_ratio']
new_data  = data.loc[data['spelling_errors_ratio'] > 0]

In [149]:
import statsmodels.formula.api as sm
model = sm.logit(
    "label ~ spelling_errors_ratio",
    data = new_data
).fit()

Optimization terminated successfully.
         Current function value: 0.690010
         Iterations 5


In [150]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,7235.0
Model:,Logit,Df Residuals:,7233.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,0.003688
Time:,17:25:56,Log-Likelihood:,-4992.2
converged:,True,LL-Null:,-5010.7
,,LLR p-value:,1.209e-09

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.2665,0.041,6.428,0.000,0.185 0.348
spelling_errors_ratio,-1.9261,0.335,-5.751,0.000,-2.583 -1.270


#### Fit a logistic regression model using statsmodels with text features
- Add text features that may be useful, add this to the model and see if they improve the fit
- Examine the coefficients, does the feature increase or decrease the effect of being evergreen?

In [169]:
data = org_data

In [170]:
# EXAMPLE text feature 'recipe'

data['is_recipe'] = data['title'].fillna('').str.contains('chicken')

In [165]:
data.shape

(7395, 30)

In [168]:
data.head(0)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body,is_recipe


In [171]:
import statsmodels.formula.api as sm
model = sm.logit(
    " label ~ is_recipe",
    data = data
).fit()

Optimization terminated successfully.
         Current function value: 0.688092
         Iterations 7


In [172]:
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,7395.0
Model:,Logit,Df Residuals:,7393.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 05 Dec 2016",Pseudo R-squ.:,0.006785
Time:,17:54:27,Log-Likelihood:,-5088.4
converged:,True,LL-Null:,-5123.2
,,LLR p-value:,7.56e-17

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
Intercept,0.0371,0.023,1.588,0.112,-0.009 0.083
is_recipe[T.True],3.4129,0.719,4.749,0.000,2.004 4.821
