In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

In [2]:
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size = size,
        return mean + np.sqrt(variance) * np.random.randn(*size)
# 用于复现
np.random.seed(12345)
N = 100
X = np.c_[dnorm(0, 0.4,size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N) 
         ]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
y = np.dot(X, beta) + eps


In [3]:
X[:5]

array([[-0.12946849, -1.21275292,  0.50422488],
       [ 0.30291036, -0.43574176, -0.25417986],
       [-0.32852189, -0.02530153,  0.13835097],
       [-0.35147471, -0.71960511, -0.25821463],
       [ 1.2432688 , -0.37379916, -0.52262905]])

In [4]:
X_model = sm.add_constant(X)

In [5]:
X_model[:5]

array([[ 1.        , -0.12946849, -1.21275292,  0.50422488],
       [ 1.        ,  0.30291036, -0.43574176, -0.25417986],
       [ 1.        , -0.32852189, -0.02530153,  0.13835097],
       [ 1.        , -0.35147471, -0.71960511, -0.25821463],
       [ 1.        ,  1.2432688 , -0.37379916, -0.52262905]])

In [6]:
y

array([ 0.42786349, -0.67348041, -0.09087764, -0.48949442, -0.12894109,
       -0.04501494,  0.08757735, -0.50456809, -0.54582359,  0.26527124,
        0.59784431,  0.45268655,  0.08698737,  0.05540612, -0.09117045,
        0.14472907, -0.15127161, -0.05633559,  1.2167688 , -0.02230032,
       -0.69063922,  0.08524475,  0.73444882, -0.35271834, -0.25469893,
        0.30780133,  0.70383282, -0.5331801 , -0.22072084, -0.09677542,
       -0.49691476, -1.33344177, -0.37685375,  1.25999316, -0.29484543,
       -0.61445479,  0.18725508, -0.40779804,  0.05730302,  0.4745453 ,
       -0.43516233,  0.03148314, -0.05635841,  0.12133475,  0.22345618,
        0.05955794,  0.25805322, -0.2750181 ,  0.30513496, -0.20032791,
        0.08627269, -0.42451706,  0.23481135, -0.32057314,  0.67561398,
       -0.38726135, -0.37863875, -0.16376385, -0.17011089,  0.39236031,
       -0.13687819,  0.18865275, -0.13990581,  0.61372834, -0.40825235,
        0.46866481, -0.59632133, -0.07708193,  0.70818684,  0.14

In [7]:
X

array([[-1.29468492e-01, -1.21275292e+00,  5.04224878e-01],
       [ 3.02910364e-01, -4.35741756e-01, -2.54179861e-01],
       [-3.28521889e-01, -2.53015334e-02,  1.38350968e-01],
       [-3.51474705e-01, -7.19605110e-01, -2.58214633e-01],
       [ 1.24326880e+00, -3.73799164e-01, -5.22629046e-01],
       [ 8.81267227e-01, -2.80898544e-02, -3.68960148e-01],
       [ 5.87601006e-02,  8.48485492e-01, -1.18261588e+00],
       [ 1.78191913e-01,  7.59823931e-01, -6.84173312e-02],
       [ 4.86372577e-01, -4.56615198e-01, -3.36269295e-01],
       [ 7.88314544e-01,  1.22517962e+00, -5.93046604e-02],
       [ 6.37002481e-01, -4.09556235e-01,  6.51724241e-01],
       [-8.19802211e-01,  3.53992127e-01,  2.72581984e-01],
       [ 1.73919980e-01,  7.20350703e-01, -2.20824797e-01],
       [ 1.44777217e-01, -1.21555178e+00,  5.54535862e-01],
       [ 8.55659737e-01, -7.92015008e-01, -6.06967863e-02],
       [ 5.60627140e-01, -3.12028394e-01,  6.39534138e-01],
       [-1.26594659e+00,  1.70788390e-01

In [8]:
model = sm.OLS(y, X)

In [9]:
results = model.fit()

In [10]:
results.params

array([0.17826108, 0.22303962, 0.50095093])

In [11]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.430
Model:                            OLS   Adj. R-squared:                  0.413
Method:                 Least Squares   F-statistic:                     24.42
Date:                Wed, 29 May 2019   Prob (F-statistic):           7.44e-12
Time:                        14:44:26   Log-Likelihood:                -34.305
No. Observations:                 100   AIC:                             74.61
Df Residuals:                      97   BIC:                             82.42
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.1783      0.053      3.364      0.0

In [12]:
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])

In [13]:
data['Y'] = y

In [14]:
data.head()

Unnamed: 0,col0,col1,col2,Y
0,-0.129468,-1.212753,0.504225,0.427863
1,0.30291,-0.435742,-0.25418,-0.67348
2,-0.328522,-0.025302,0.138351,-0.090878
3,-0.351475,-0.719605,-0.258215,-0.489494
4,1.243269,-0.373799,-0.522629,-0.128941


In [15]:
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()

In [16]:
results.params

Intercept    0.033559
col0         0.176149
col1         0.224826
col2         0.514808
dtype: float64

In [17]:
results.tvalues

Intercept    0.952188
col0         3.319754
col1         4.850730
col2         6.303971
dtype: float64

In [18]:
results.predict(data[:5])

0   -0.002327
1   -0.141904
2    0.041226
3   -0.323070
4   -0.100535
dtype: float64

In [19]:
init_x = 4

In [20]:
import random

In [21]:
values = [init_x, init_x]

In [22]:
b0 = 0.8

In [23]:
b1 = -0.4

In [24]:
noise = dnorm(0, 0.1, N)

In [25]:
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)

In [26]:
MAXLAG5 = 5

In [27]:
model = sm.tsa.AR(values)

In [28]:
results = model.fit(MAXLAG5)

In [29]:
results.params

array([-0.00598371,  0.91770719, -0.48809412,  0.10959298, -0.14387483,
        0.13791896])

In [30]:
train = pd.read_csv(r'C:/Users/Administrator/Desktop/pydata-book-2nd-edition/datasets/titanic/train.csv')

In [31]:
test = pd.read_csv(r'C:/Users/Administrator/Desktop/pydata-book-2nd-edition/datasets/titanic/test.csv')

In [32]:
train[:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [33]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [34]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [35]:
impute_value = train['Age'].median()

In [36]:
train['Age'] = train['Age'].median()

In [38]:
test['Age'] = test['Age'].fillna(impute_value)

In [39]:
train['IsFemale'] = (train['Sex'] == 'female').astype(int)

In [43]:
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

In [44]:
predictors = ['Pclass', 'IsFemale', 'Age']

In [45]:
X_train = train[predictors].values

In [46]:
X_test = test[predictors].values

In [47]:
y_train = train['Survived'].values

In [48]:
X_train[:5]

array([[ 3.,  0., 28.],
       [ 1.,  1., 28.],
       [ 3.,  1., 28.],
       [ 1.,  1., 28.],
       [ 3.,  0., 28.]])

In [49]:
y_train[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [50]:
from sklearn.linear_model import LogisticRegression

In [51]:
model = LogisticRegression()

In [52]:
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
y_predict = model.predict(X_test)

In [54]:
y_predict[:10]

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [59]:
model_cv = LogisticRegression()

In [60]:
model_cv.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
from sklearn.linear_model import LogisticRegressionCV
model_cv = LogisticRegressionCV(10)
model_cv.fit(X_train, y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [62]:
from sklearn.model_selection import cross_val_score

In [63]:
score = cross_val_score(model, X_train, y_train, cv=4)



In [64]:
score

array([0.79464286, 0.81165919, 0.76126126, 0.77927928])