In [17]:
import pandas as pd
import statsmodels.formula.api as smf

# Load the mediause dataset from cssbook.net.
df = pd.read_csv('http://cssbook.net/d/mediause.csv')

# Ordinary least squares: regress `newspaper` on `age` and `gender`.
ols_spec = smf.ols(formula='newspaper ~ age + gender', data=df)
model = ols_spec.fit()

# Only the estimated coefficients are displayed here;
# model.summary() would print the complete results table.
model.params

Intercept   -0.089560
age          0.067620
gender       0.176665
dtype: float64

In [19]:
# Display the full OLS results table for the fitted model.
model.summary()

0,1,2,3
Dep. Variable:,newspaper,R-squared:,0.19
Model:,OLS,Adj. R-squared:,0.19
Method:,Least Squares,F-statistic:,244.4
Date:,"Thu, 12 Mar 2020",Prob (F-statistic):,4.76e-96
Time:,10:47:24,Log-Likelihood:,-4914.2
No. Observations:,2081,AIC:,9834.0
Df Residuals:,2078,BIC:,9851.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0896,0.159,-0.564,0.573,-0.401,0.222
age,0.0676,0.003,21.872,0.000,0.062,0.074
gender,0.1767,0.113,1.563,0.118,-0.045,0.398

0,1,2,3
Omnibus:,671.643,Durbin-Watson:,1.951
Prob(Omnibus):,0.0,Jarque-Bera (JB):,101.096
Skew:,-0.082,Prob(JB):,1.11e-22
Kurtosis:,1.933,Cond. No.,145.0


In [2]:
# Two hypothetical cases, one column per predictor used in the model formula.
newdata = pd.DataFrame({'gender': [1, 0], 'age': [20, 40]})

# Predicted value of the dependent variable for each row of `newdata`.
model.predict(newdata)

0    1.439508
1    2.615248
dtype: float64

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('http://cssbook.net/d/mediause.csv')

# Binary target: True when the `internet` column is greater than zero.
df['uses-internet'] = df['internet'] > 0

# Discard every row that has a missing value in any column.
df = df.dropna()

print("How many people used online news at all?")
print(df['uses-internet'].value_counts())

# Hold out 20% of the cases for testing; the fixed random_state makes the
# split reproducible.
features = df[['age', 'education', 'gender']]
target = df['uses-internet']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

print('We have {} training and {} test cases.'.format(len(X_train), len(X_test)))

How many people used online news at all?
True     1262
False     803
Name: uses-internet, dtype: int64
We have 1652 training and 413 test cases.


In [20]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier on the training split.
# fit() returns the estimator itself, so creation and training are chained.
myclassifier = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test cases.
y_pred = myclassifier.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix, classification_report

# Cross-tabulation of the true test labels against the predicted labels.
print('Confusion matrix:')
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Per-class precision, recall and F1, plus the averaged figures.
report = classification_report(y_test, y_pred)
print(report)


Confusion matrix:
[[ 55 106]
 [ 40 212]]
              precision    recall  f1-score   support

       False       0.58      0.34      0.43       161
        True       0.67      0.84      0.74       252

   micro avg       0.65      0.65      0.65       413
   macro avg       0.62      0.59      0.59       413
weighted avg       0.63      0.65      0.62       413



In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (L-BFGS solver), trained on the training split.
myclassifier = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Predicted labels for the held-out test cases.
y_pred = myclassifier.predict(X_test)

In [8]:
from sklearn.svm import SVC
from sklearn import preprocessing

# !!! We standardize our features to have M = 0 and SD = 1.
# This is necessary because our features are not measured on the same scale,
# and SVMs need comparably scaled inputs.
# It may also be OK to rescale to a range of [0:1] or [-1:1].

# Estimate the scaling parameters on the training data only, so nothing from
# the test set feeds into the preprocessing step.
scaler = preprocessing.StandardScaler().fit(X_train)

# Apply the same training-derived transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the support vector classifier on the scaled features.
myclassifier = SVC(gamma='scale')
myclassifier.fit(X_train_scaled, y_train)

# Predictions must be made on the scaled test features as well.
y_pred = myclassifier.predict(X_test_scaled)

  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees. Tree construction is randomized, so the seed
# is fixed (same value as the train/test split) to make the fitted model and
# its predictions reproducible across runs.
myclassifier = RandomForestClassifier(n_estimators=100, random_state=42)
myclassifier.fit(X_train, y_train)

# Predicted labels for the held-out test cases.
y_pred = myclassifier.predict(X_test)


In [10]:
from sklearn.model_selection import cross_val_score

# Five-fold cross-validation on the training data, scored by accuracy.
myclassifier = LogisticRegression(solver='lbfgs')
accuracy = cross_val_score(
    estimator=myclassifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(accuracy)

# Mean and standard deviation of the per-fold scores.
fold_mean, fold_sd = accuracy.mean(), accuracy.std()
print("M = {:.2f}, SD = {:.3f}".format(fold_mean, fold_sd))

[0.64652568 0.64048338 0.62727273 0.64242424 0.63636364]
M = 0.64, SD = 0.007


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score, make_scorer

# Five-fold cross-validation scored with Cohen's kappa. The metric function is
# wrapped with make_scorer so that it can be passed as the `scoring` argument.
# (The scores are kappa values, hence the variable name; they are not F1 scores.)
kappa_scorer = make_scorer(cohen_kappa_score)
kappa_scores = cross_val_score(estimator=myclassifier, X=X_train, y=y_train, scoring=kappa_scorer, cv=5)
print(kappa_scores)

# Mean and standard deviation of the per-fold kappa values.
print("M = {:.2f}, SD = {:.3f}".format(kappa_scores.mean(), kappa_scores.std()))


[0.1948816  0.19576536 0.15065913 0.1788275  0.16490932]
M = 0.18, SD = 0.017


In [14]:
from sklearn.model_selection import GridSearchCV

# Random forests are randomized; fixing the seed (same value as the train/test
# split) makes the selected hyperparameters and the report below reproducible.
myclassifier = RandomForestClassifier(random_state=42)

# Every combination of these values is tried: 4 x 2 x 2 = 16 candidate settings.
grid = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Each candidate is evaluated with five-fold cross-validation on the training
# data and ranked by F1 score.
search = GridSearchCV(estimator=myclassifier,
                      param_grid=grid,
                      scoring='f1',
                      cv=5)
search.fit(X_train, y_train)
print('Using these hyperparameters {}, we get the best performance:'.format(search.best_params_))

# With GridSearchCV's default refit=True, search.predict() uses the estimator
# refit on the training data with the best parameters; the test set is only
# touched here, for the final evaluation.
print(classification_report(y_test, search.predict(X_test)))

Using these hyperparameters {'bootstrap': True, 'n_estimators': 200, 'criterion': 'gini'}, we get the best performance:
              precision    recall  f1-score   support

       False       0.43      0.37      0.40       161
        True       0.63      0.69      0.66       252

   micro avg       0.57      0.57      0.57       413
   macro avg       0.53      0.53      0.53       413
weighted avg       0.56      0.57      0.56       413

