In [84]:
import pandas as pd
import random
import numpy as np

# Seed every random number generator up front so the balanced sample below is repeatable.
random_seed = 1998
random.seed(random_seed)     # Python's built-in `random` module
np.random.seed(random_seed)  # NumPy's global RNG (pandas' sample() draws from it when no random_state is passed)

df = pd.read_parquet('data230711.parquet')

# Build a class-balanced subset: the same number of rows from each `landslide` class.
# replace=False means a row can be picked at most once within a class.
n_per_class = 97
df_0 = df.loc[df['landslide'] == 0].sample(n=n_per_class, replace=False)
df_1 = df.loc[df['landslide'] == 1].sample(n=n_per_class, replace=False)

# Stack the class-0 rows on top of the class-1 rows; original row labels are kept.
data = pd.concat([df_0, df_1])
data

Unnamed: 0,aspect,dem,faults,landform,landslide,landuse,plan,profile,rainaverage,river,slope,twi
4092029,9,1,8,4,0,6,6,4,75.11,8,4,3
11098540,9,2,2,1,0,4,5,4,66.94,8,1,3
6918169,4,3,7,1,0,4,8,2,74.91,3,7,1
4472055,7,1,8,18,0,8,4,4,76.06,1,1,8
633381,2,3,8,1,0,4,7,2,78.01,2,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
9525032,5,4,3,3,1,6,2,7,73.72,3,4,5
7043355,6,3,7,3,1,6,6,4,74.17,5,3,2
1318338,6,1,8,3,1,1,5,3,73.07,8,3,3
4817144,4,1,8,8,1,6,5,4,76.01,6,2,3


数据归一化

In [86]:

# Per-column extremes of the sampled frame.
min_values = data.min()
max_values = data.max()

# Min-Max scaling: (value - column min) / (column max - column min), applied to
# every column of `data`, so each column ends up spanning [0, 1].
value_range = max_values - min_values
normalized_data = data.sub(min_values).div(value_range)
normalized_data


Unnamed: 0,aspect,dem,faults,landform,landslide,landuse,plan,profile,rainaverage,river,slope,twi
4092029,1.000000,0.000000,1.000000,0.166667,0.0,0.625,0.666667,0.428571,0.613900,1.000000,0.428571,0.285714
11098540,1.000000,0.166667,0.142857,0.000000,0.0,0.375,0.500000,0.428571,0.219595,1.000000,0.000000,0.285714
6918169,0.285714,0.333333,0.857143,0.000000,0.0,0.375,1.000000,0.142857,0.604247,0.285714,0.857143,0.000000
4472055,0.714286,0.000000,1.000000,0.944444,0.0,0.875,0.333333,0.428571,0.659749,0.000000,0.000000,1.000000
633381,0.000000,0.333333,1.000000,0.000000,0.0,0.375,0.833333,0.142857,0.753861,0.142857,0.285714,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
9525032,0.428571,0.500000,0.285714,0.111111,1.0,0.625,0.000000,0.857143,0.546815,0.285714,0.428571,0.571429
7043355,0.571429,0.333333,0.857143,0.111111,1.0,0.625,0.666667,0.428571,0.568533,0.571429,0.285714,0.142857
1318338,0.571429,0.000000,1.000000,0.111111,1.0,0.000,0.500000,0.285714,0.515444,1.000000,0.285714,0.285714
4817144,0.285714,0.000000,1.000000,0.388889,1.0,0.625,0.500000,0.428571,0.657336,0.714286,0.142857,0.285714


计算VIF值

In [92]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Predictor matrix: every normalized column except the `landslide` label.
# NOTE: this rebinds `data` (until now the sampled frame that still contained
# `landslide`), so re-running the normalization cell after this one would
# operate on the label-free frame instead.
data = normalized_data.drop(columns='landslide')

# One variance inflation factor per predictor column, in column order.
# NOTE(review): the matrix passed in has no constant/intercept column, and
# variance_inflation_factor appears not to add one itself -- confirm against
# the statsmodels documentation whether these uncentered VIFs are intended.
design = data.values
vif = pd.DataFrame({
    'Features': data.columns,
    'VIF': [variance_inflation_factor(design, col) for col in range(design.shape[1])],
})

vif

Unnamed: 0,Features,VIF
0,aspect,2.973135
1,dem,3.130396
2,faults,3.468072
3,landform,2.343297
4,landuse,8.384701
5,plan,5.851202
6,profile,5.891464
7,rainaverage,6.773529
8,river,3.455474
9,slope,5.148165


In [94]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import numpy as np

x = data

# Rule of thumb: VIF < 10 -> no multicollinearity; 10 <= VIF < 100 -> strong
# multicollinearity; VIF >= 100 -> severe multicollinearity.
#
# Tolerance is the reciprocal of VIF. The VIFs for these same columns of `data`
# are already stored in vif['VIF'], so take their reciprocals instead of
# refitting one regression per column a second time. This also keeps the
# tolerance values consistent with the VIF column by construction.
# .tolist() yields a plain list of floats, in the same column order as `vif`.
tol = (1.0 / vif['VIF']).tolist()
print(tol)



[0.33634533361163566, 0.31944840910099503, 0.28834460808213025, 0.4267491349017915, 0.11926483911999497, 0.1709050473471011, 0.16973708379051478, 0.14763352133141527, 0.28939593683317466, 0.1942439644100793, 0.24409233076609307]


In [95]:
# Attach the tolerance values (1 / VIF) as a new column of the VIF table;
# `tol` must have one entry per row of `vif`, in the same order.
vif["Tolerance"]=tol

In [99]:
# Export the table, rounded to 3 decimals, to VIF.xlsx in the working directory.
# `vif` itself is left unrounded, since round() returns a new frame.
# NOTE(review): to_excel also writes the row index by default -- confirm that is wanted.
vif.round(3).to_excel('VIF.xlsx')