### 一、用 statsmodels 实现逻辑回归

In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit, Probit, MNLogit

import warnings
import matplotlib as mpl
warnings.filterwarnings('ignore')

### 1、读取数据

In [3]:
# Load the IPO sample; the first CSV column is used as the row index.
data = pd.read_csv('../data/book_ipo.csv', index_col=0)

# Preview only the columns referenced in the cells below.
preview_cols = ['ir', 'nasdaq15_pos', 'lrassets', 'vc', 'odate']
data.loc[:, preview_cols]

Unnamed: 0,ir,nasdaq15_pos,lrassets,vc,odate
0,10.000000,6.35,-0.295515,0,1980
1,-5.000000,4.97,1.251122,0,1980
2,28.571429,0.00,5.423085,0,1980
3,-5.555556,0.00,3.464674,0,1980
4,15.277778,0.00,3.402023,1,1980
...,...,...,...,...,...
8612,-10.785714,3.38,4.428552,1,2019
8613,0.000000,3.38,4.511299,1,2019
8614,2.692308,3.23,3.282038,1,2019
8615,30.000000,4.52,3.106826,1,2019


### 2、二分类

In [4]:
# Binary label: 1 when ir is strictly positive, 0 otherwise.
data['ir_g2'] = data['ir'].gt(0).astype(int)
data.columns

Index(['odate', 'mm', 'dd', 'op', 'perm', 'primary', 'netdum', 'vc', 'dual',
       'spinoff', 'lo', 'hi', 'eps12', 'nummgr', 'rollup', 'crspvw15',
       'nasdaq15', 'toptier', 'mid', 'china', 'foreign', 'ir', 'prime',
       'nasdaq', 'copro', 'secreval', 'buyout', 'revision', 'overhang', 'sole',
       'multbook', 'financial', 'biotech', 'tech', 'realpro', 'realsales',
       'unicorn', 'realmvop', 'lnrealmvop', 'rassets', 'age', 'three', 'lage',
       'psrop', 'fulldate', 'auction', 'ir21', 'ff17vw21d', 'ff17ew21d',
       'ir_avg3m', 'ir_avgf3m', 'ir_avg6m', 'ir_avgf6m', 'ir_avg1y',
       'lnipo_n3m', 'lnipo_nf3m', 'lnipo_n6m', 'lnipo_nf6m', 'lnipo_n1y',
       'op_above', 'op_in', 'op_below', 'eps12_pos', 'file_range',
       'file_range_pct', 'op_integer', 'op_mid', 'op_mid_pos', 'op_mid_neg',
       'hi_integer', 'lo_integer', 'revision_pos', 'revision_neg', 'lnummgr',
       'lrealsales', 'lrassets', 'crspvw15_pos', 'crspvw15_neg',
       'nasdaq15_pos', 'nasdaq15_neg', 'ff1

In [7]:


# Regressors and binary outcome; X and Y are reused by later cells.
X = data[['nasdaq15_pos', 'lrassets', 'vc']]
Y = data['ir_g2']

# Probit of ir_g2 on the regressors plus an intercept column.
probit_model = Probit(endog=Y, exog=sm.add_constant(X))
result = probit_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.592544
         Iterations 5


0,1,2,3
Dep. Variable:,ir_g2,No. Observations:,8583.0
Model:,Probit,Df Residuals:,8579.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 09 Nov 2025",Pseudo R-squ.:,0.02165
Time:,23:18:47,Log-Likelihood:,-5085.8
converged:,True,LL-Null:,-5198.3
Covariance Type:,nonrobust,LLR p-value:,1.635e-48

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.2247,0.041,5.526,0.000,0.145,0.304
nasdaq15_pos,0.0719,0.006,12.738,0.000,0.061,0.083
lrassets,0.0221,0.008,2.879,0.004,0.007,0.037
vc,0.1953,0.030,6.445,0.000,0.136,0.255


In [8]:
# Show the binary target series (data['ir_g2']) used as the dependent variable.
Y

0       1
1       0
2       1
3       0
4       1
       ..
8612    0
8613    0
8614    1
8615    1
8616    1
Name: ir_g2, Length: 8583, dtype: int32

In [9]:
# Logit counterpart of the probit fit: same outcome Y, same regressors X,
# with an intercept column added.
logist_model = Logit(endog=Y, exog=sm.add_constant(X))
result = logist_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.592456
         Iterations 5


0,1,2,3
Dep. Variable:,ir_g2,No. Observations:,8583.0
Model:,Logit,Df Residuals:,8579.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 09 Nov 2025",Pseudo R-squ.:,0.02179
Time:,23:18:49,Log-Likelihood:,-5085.1
converged:,True,LL-Null:,-5198.3
Covariance Type:,nonrobust,LLR p-value:,7.732e-49

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.3496,0.067,5.215,0.000,0.218,0.481
nasdaq15_pos,0.1248,0.010,12.403,0.000,0.105,0.144
lrassets,0.0358,0.013,2.804,0.005,0.011,0.061
vc,0.3235,0.051,6.378,0.000,0.224,0.423


###  3、三分类

In [10]:
# Three-way label: +1 when ir > 10, -1 when ir < -10, and 0 in between.
data['ir_g3'] = data['ir'].gt(10).astype(int) - data['ir'].lt(-10).astype(int)
data

Unnamed: 0,odate,mm,dd,op,perm,primary,netdum,vc,dual,spinoff,...,crspvw15_pos,crspvw15_neg,nasdaq15_pos,nasdaq15_neg,ff17vw21d_pos,ff17vw21d_neg,ff17ew21d_pos,ff17ew21d_neg,ir_g2,ir_g3
0,1980,1,30,7.5,64849,100.00,0,0,0,0,...,5.05,0.00,6.35,0.00,15.147995,0.000000,33.778513,0.000000,1,0
1,1980,2,13,5.0,61400,100.00,0,0,0,9,...,6.09,0.00,4.97,0.00,6.246483,0.000000,7.283862,0.000000,0,0
2,1980,2,28,14.0,62519,100.00,0,0,0,9,...,0.00,-1.42,0.00,-2.26,12.688880,0.000000,10.515948,0.000000,1,1
3,1980,3,4,13.5,64397,26.09,0,0,0,0,...,0.00,-3.98,0.00,-4.58,0.000000,-4.728272,0.000000,-1.675119,0,0
4,1980,3,11,18.0,20590,46.13,0,1,0,0,...,0.00,-8.13,0.00,-10.92,0.000000,-8.819079,0.000000,-11.473117,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8612,2019,11,7,14.0,19067,100.00,0,1,0,0,...,2.92,0.00,3.38,0.00,6.832362,0.000000,6.297551,0.000000,0,-1
8613,2019,11,7,12.0,19072,100.00,0,1,0,0,...,2.92,0.00,3.38,0.00,6.832362,0.000000,6.297551,0.000000,0,0
8614,2019,11,8,13.0,19082,100.00,0,1,0,0,...,2.85,0.00,3.23,0.00,6.229506,0.000000,6.191285,0.000000,1,0
8615,2019,11,11,16.0,19070,100.00,0,1,0,0,...,3.54,0.00,4.52,0.00,3.450139,0.000000,6.516494,0.000000,1,1


In [11]:

# Multinomial logit of the three-class label ir_g3 on X plus an intercept.
mnLogit_model = MNLogit(endog=data['ir_g3'], exog=sm.add_constant(X))
result = mnLogit_model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.756801
         Iterations 8


0,1,2,3
Dep. Variable:,ir_g3,No. Observations:,8583.0
Model:,MNLogit,Df Residuals:,8575.0
Method:,MLE,Df Model:,6.0
Date:,"Sun, 09 Nov 2025",Pseudo R-squ.:,0.03228
Time:,23:18:51,Log-Likelihood:,-6495.6
converged:,True,LL-Null:,-6712.3
Covariance Type:,nonrobust,LLR p-value:,1.917e-90

ir_g3=0,coef,std err,z,P>|z|,[0.025,0.975]
const,3.2688,0.213,15.369,0.000,2.852,3.686
nasdaq15_pos,0.0086,0.029,0.292,0.770,-0.049,0.066
lrassets,0.0569,0.041,1.380,0.168,-0.024,0.138
vc,-0.9234,0.144,-6.406,0.000,-1.206,-0.641
ir_g3=1,coef,std err,z,P>|z|,[0.025,0.975]
const,2.5901,0.215,12.072,0.000,2.170,3.011
nasdaq15_pos,0.1237,0.029,4.245,0.000,0.067,0.181
lrassets,0.0376,0.042,0.904,0.366,-0.044,0.119
vc,-0.3215,0.145,-2.220,0.026,-0.605,-0.038


### 二、用sklearn实现逻辑回归

### （1）第一步：导入逻辑回归模型

In [12]:
from sklearn.linear_model import LogisticRegression

### （2）第二步：实例化模型

创建逻辑回归模型的实例是使用逻辑回归的第一步。可以直接实例化`LogisticRegression`类，并根据需要设置参数。例如：

model = LogisticRegression()

In [13]:
# Instantiate the classifier with its main hyperparameters written out
# explicitly so that each one can be adjusted in place.
#
# `multi_class` is intentionally not passed: 'auto' is already the default,
# and the argument has been deprecated since scikit-learn 1.5, so omitting it
# keeps this cell working on newer releases without changing the fitted model.
model = LogisticRegression(penalty='l2',
                           dual=False,
                           tol=0.0001,
                           C=1.0,
                           fit_intercept=True,
                           intercept_scaling=1,
                           class_weight=None,
                           random_state=None,
                           solver='lbfgs',
                           max_iter=100,
                           verbose=0,
                           warm_start=False,
                           n_jobs=None,
                           l1_ratio=None)

这些参数的简要说明如下：

penalty：指定惩罚（正则化项）类型，用于避免过拟合，可以是 'l1'、'l2'、'elasticnet' 或 None（不加惩罚；旧版本中写作字符串 'none'，自 scikit-learn 1.2 起已弃用）。

C：正则化强度的倒数，较小的值指定更强的正则化。

fit_intercept：指定是否应该向决策函数中添加常数项（也称截距）。

intercept_scaling：仅在使用求解器 'liblinear' 且 `fit_intercept` 为 True 时有用。此时会在每个样本的特征向量后追加一个取值恒为 `intercept_scaling` 的合成特征，截距等于 `intercept_scaling` 乘以该合成特征的权重；由于该权重同样受正则化约束，增大此参数可以减弱正则化对截距的影响。

class_weight：用于标示各个类别的权重，通常用于处理类别不平衡的问题。如果不设置，则所有类的权重都为 1。

random_state：随机数生成器的种子。仅当求解器为 'sag'、'saga' 或 'liblinear' 时用于打乱数据；需要复现结果时应设为固定的整数。

solver：指定在优化问题中使用的算法，如 'liblinear'、'newton-cg'、'lbfgs'、'sag' 和 'saga'。

max_iter：求解器内部迭代的最大次数，即优化算法的迭代次数。

multi_class：如果是多类（非二分类）问题，则指定多类策略，可以是 'auto'（默认，自动选择）、'ovr'（一对剩余）或 'multinomial'。注意：该参数自 scikit-learn 1.5 起已弃用，新版本中无需显式设置。

verbose：对于某些求解器，正整数的`verbose`参数可以用于显示求解器中的进程信息。

warm_start：设为True时，使用前一个调用的解决方案作为初始化，否则，只是擦除前一个解决方案。

n_jobs：用于指定并行作业的数量。`None`意味着1，而-1意味着使用所有处理器。

l1_ratio：仅在`penalty='elasticnet'`时使用，这是Elastic-Net混合参数，其中 0 <= l1_ratio <= 1。

在实际应用中，通常需要调整这些参数以更好地适配数据集，改进模型的性能。

### （3）第三步：拟合模型

拟合模型涉及模型的训练。使用`fit`方法，你可以将数据集（特征向量和目标变量）输入模型进行训练。例如：

model.fit(X_train, y_train)

这里，`X_train`是包含特征的数组或矩阵，而`y_train`是包含目标变量（标签）的数组。

In [14]:
# Reuse the full sample as the training set; no hold-out split is made here.
X_train, y_train = X, Y

In [15]:
# Train the classifier on the training features and labels.
model.fit(X=X_train, y=y_train)

### （4）第四步：预测

一旦模型被训练，就可以对新的数据进行预测。使用`predict`方法来预测新数据点的类别，或者使用`predict_proba`方法来获得属于各个类别的概率。例如：

predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)
这里，`X_test`是需要进行预测的数据集。

In [None]:
# Predicted class labels for the same rows the model was fitted on.
preds = model.predict(X=X_train)

### （5）第五步：评估模型

`sklearn`提供了多种评估分类模型性能的方法。例如，你可以使用`accuracy_score`方法来计算模型的准确率：

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_train, preds)
这里，`y_train`是训练集的真实标签，`preds`是模型在同一训练集上的预测，因此得到的是样本内（训练集）准确率，而不是测试集上的表现。

In [17]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted label matches the true label. Both come from
# the training data, so this is an in-sample accuracy.
accuracy = accuracy_score(y_true=y_train, y_pred=preds)
accuracy

0.7060468367703601