In [1]:
import os

import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse, precision_score as precision, recall_score as recall, accuracy_score as accuracy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from graph import *
from graph_characteristic import *
from graph_simulate_statistics import *

# **Нагенерируем датасетиков для разных размеров выборок**

In [2]:
# Build the 25-vertex dataset by passing gen_gamma_points and gen_weibull_points
# to generate_dataset, then store it as CSV under ../data (no index column).
csv_path_25 = os.path.join('..', 'data', 'df_25_vert_Student2.csv')
df_25 = generate_dataset(
    gen_gamma_points,
    gen_weibull_points,
    dataset_size=10000,
    num_vertex=25,
)
df_25.to_csv(csv_path_25, index=False)

100%|██████████| 5000/5000 [00:28<00:00, 172.77it/s]


In [4]:
# Build the 100-vertex dataset by passing gen_gamma_points and gen_weibull_points
# to generate_dataset, then store it as CSV under ../data (no index column).
csv_path_100 = os.path.join('..', 'data', 'df_100_vert_Student2.csv')
df_100 = generate_dataset(
    gen_gamma_points,
    gen_weibull_points,
    dataset_size=5000,
    num_vertex=100,
)
df_100.to_csv(csv_path_100, index=False)

100%|██████████| 2500/2500 [04:54<00:00,  8.48it/s] 


In [5]:
# Top up the 500-vertex dataset towards 100 rows in batches of 10.
# The CSV is rewritten after every batch, so re-running this cell continues
# from whatever was saved last instead of starting over.
df_500_path = os.path.join('..', 'data', 'df_500_vert_Student2.csv')

if os.path.exists(df_500_path):
    df_500 = pd.read_csv(df_500_path)
else:
    # Fresh checkout: there is no CSV to resume from yet, so seed it with a
    # first batch instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(df_500_path), exist_ok=True)
    df_500 = generate_dataset(gen_gamma_points, gen_weibull_points, dataset_size=10, num_vertex=500)
    df_500.to_csv(df_500_path, index=False)

# The number of batches is fixed from the row count at this point.
# NOTE(review): this assumes every batch adds exactly 10 rows — confirm
# against generate_dataset.
for i in range( (100 - df_500.shape[0]) // 10 ):
    print("data size = ", df_500.shape[0])
    df_batch = generate_dataset(gen_gamma_points, gen_weibull_points, dataset_size=10, num_vertex=500)
    df_500 = pd.concat([df_500, df_batch], ignore_index=True)
    # Save after every batch so already generated rows survive an interruption.
    df_500.to_csv(df_500_path, index=False)

# **Поехали обучать**

In [6]:
# Reload the three datasets from ../data so the training cell below works
# from the CSV files on disk.
data_dir = os.path.join('..', 'data')
df_25 = pd.read_csv(os.path.join(data_dir, 'df_25_vert_Student2.csv'))
df_100 = pd.read_csv(os.path.join(data_dir, 'df_100_vert_Student2.csv'))
df_500 = pd.read_csv(os.path.join(data_dir, 'df_500_vert_Student2.csv'))

In [7]:
# Train and evaluate every model separately for each graph size.
for n, df in zip([25, 100, 500], [df_25, df_100, df_500]):
    # 'distribution' is the label column; all remaining columns are features.
    X, y = df.drop('distribution', axis = 1), df['distribution']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=239, stratify=y)

    # Fit the scaler on the training split only, then reuse it on the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    logistic_clf = LogisticRegression(max_iter=1000, class_weight='balanced')
    logistic_clf.fit(X_train_scaled, y_train)
    y_pred_logistic = logistic_clf.predict(X_test_scaled)

    # The regressors are used as classifiers: a continuous prediction above
    # 0.5 is mapped to 1, anything else to 0.
    # NOTE(review): this presumes 'distribution' is encoded as 0/1 — confirm.
    linear_reg = LinearRegression()
    linear_reg.fit(X_train_scaled, y_train)
    y_pred_linear = (linear_reg.predict(X_test_scaled) > 0.5).astype(int)

    ridge_reg = Ridge()
    ridge_reg.fit(X_train_scaled, y_train)
    y_pred_ridge = (ridge_reg.predict(X_test_scaled) > 0.5).astype(int)

    # Lasso is intentionally not fitted here: its predictions were never
    # reported, so training it on every iteration was dead computation.

    print(f"N = {n}")
    # The recall label casing differs between Logistic and the other models;
    # it is kept as-is so the printed output stays byte-identical.
    for section_title, y_pred, recall_label in (
        ('\tLogistic', y_pred_logistic, "\t\tRecall:"),
        ('\tLinear', y_pred_linear, "\t\trecall:"),
        ('\tRidge', y_pred_ridge, "\t\trecall:"),
    ):
        print(section_title)
        print("\t\tPrecision:", precision(y_test, y_pred))
        print("\t\tAccuracy:", accuracy(y_test, y_pred))
        print(recall_label, recall(y_test, y_pred), "\n")


N = 25
	Logistic
		Precision: 0.8306620209059233
		Accuracy: 0.8163333333333334
		Recall: 0.7946666666666666 

	Linear
		Precision: 0.8507795100222717
		Accuracy: 0.815
		recall: 0.764 

	Ridge
		Precision: 0.8507795100222717
		Accuracy: 0.815
		recall: 0.764 

N = 100
	Logistic
		Precision: 0.9649122807017544
		Accuracy: 0.9593333333333334
		Recall: 0.9533333333333334 

	Linear
		Precision: 0.9817927170868347
		Accuracy: 0.9586666666666667
		recall: 0.9346666666666666 

	Ridge
		Precision: 0.9831697054698457
		Accuracy: 0.9593333333333334
		recall: 0.9346666666666666 

N = 500
	Logistic
		Precision: 1.0
		Accuracy: 1.0
		Recall: 1.0 

	Linear
		Precision: 1.0
		Accuracy: 1.0
		recall: 1.0 

	Ridge
		Precision: 1.0
		Accuracy: 1.0
		recall: 1.0 

