In [2425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.linear_model import LinearRegression,Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.pipeline import Pipeline

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2427]:
df = pd.read_csv("train.csv")

In [2429]:
def complaints_count_cleaning(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data["complaints_count"] = pd.to_numeric(data["complaints_count"].replace("-", np.nan))
    return data

In [2431]:
def cleaning_data_reg_date(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data["date_of_registration"] = pd.to_datetime(data["date_of_registration"])
    data["register_coef"] = (pd.Timestamp("2024-10-15")-data["date_of_registration"])/pd.Timedelta(days=365)
    data = data.drop(columns="date_of_registration")

    return data

In [2433]:
def cleaning_data_average_dwelltime(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data = data[data["average_dwelltime"]>=0]
    return data
    

In [2435]:
def add_new_features(data_clean: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data_clean["errors"] = data_clean["5xx_errors"]+data_clean["4xx_errors"]
    #data_clean["errors^2"] = data_clean["errors"]*data_clean["errors"]
    
    data_clean["buys^2"] = data_clean["buys"]*data_clean["buys"]
    data_clean["register_coef^2"] = data_clean["register_coef"]*data_clean["register_coef"]
    data_clean['involvement'] = (data_clean["buys"]+data_clean["clicks"]+data_clean["likes"])/data_clean["register_coef"]

    data_clean["likes^2"] = data_clean["likes"]*data_clean["likes"]
    data_clean["errors_per_clicks"] = data_clean['errors']/data_clean["clicks"]
    data_clean['buy_per_clicks'] = data_clean['buys'] / data_clean['clicks']
    #data_clean['complain_per_clicks'] = data_clean['complaints_count'] / data_clean['clicks']
    
    data_clean['buy_per_clicks^2'] = data_clean['buy_per_clicks'] * data_clean['buy_per_clicks']
    
    data_clean['likes_per_cliks'] = data_clean['likes'] / data_clean['clicks']
    data_clean['likes_per_cliks^2'] = data_clean['likes_per_cliks'] * data_clean['likes_per_cliks']
    
    #data_clean["reg_per_clicks"] = data_clean["clicks"]/ data_clean["register_coef"]
    #data_clean["reg_per_clicks^2"] = data_clean["reg_per_clicks"]* data_clean["reg_per_clicks"]
    
    data_clean["clicks^2"] = data_clean["clicks"]*data_clean["clicks"]
    #data_clean["reg_per_buys"] = data_clean["buys"]/ data_clean["register_coef"]
    
    

    #data_clean['buy_per_clicks^2'] = data_clean['buy_per_clicks'] * data_clean['buy_per_clicks']
    #data_clean['likes_per_cliks^2'] = data_clean['likes_per_cliks'] * data_clean['likes_per_cliks']
    return data_clean

In [2437]:
def cleaning_final(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data_final = data.copy(deep=True)
    data_final = complaints_count_cleaning(data_final)
    data_final = cleaning_data_reg_date(data_final)
    data_final = cleaning_data_average_dwelltime(data_final)
    data_final = add_new_features(data_final)
    data_final= data_final.drop(columns="4xx_errors")
    #data_final= data_final.drop(columns="5xx_errors")
    
    data_final = data_final.drop(columns="Unnamed: 0")
    data_final = data_final.drop(columns="errors")
    
    #data_final= data_final.drop(columns="5xx_errors")
    #data_final= data_final.drop(columns="complaints_count")
    
    
    data_final = data_final.dropna()

    return data_final

In [2439]:
#data_train = cleaning_final(df)

In [2441]:
data_train

Unnamed: 0,category,clicks,likes,buys,5xx_errors,complaints_count,average_dwelltime,source_attractiveness,register_coef,buys^2,register_coef^2,likes^2,errors_per_clicks,buy_per_clicks,likes_per_cliks,reg_per_clicks
1,information_source,874840.0,21100.0,0,0,0.0,10.721619,-0.022317,0.238375,0,0.056823,4.452100e+08,0.014714,0.0,0.024119,3.670016e+06
2,information_source,571210.0,94707.0,0,7420,0.0,1.922243,0.046396,0.255639,0,0.065352,8.969416e+09,0.012990,0.0,0.165801,2.234436e+06
3,news,89534.0,924.0,0,0,0.0,2.149243,-0.093360,0.093438,0,0.008731,8.537760e+05,0.009315,0.0,0.010320,9.582231e+05
4,information_source,1043953.0,289288.0,0,20260,3948.0,3.764965,0.027303,0.387772,0,0.150367,8.368755e+10,0.075324,0.0,0.277108,2.692186e+06
6,porn,99712.0,0.0,0,15178,3725.0,20.786928,0.063572,1.473325,0,2.170687,0.000000e+00,0.152218,0.0,0.000000,6.767820e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7994,information_source,4221099.0,1204744.0,0,24270,8234.0,0.451923,0.115761,0.887457,0,0.787579,1.451408e+12,0.085400,0.0,0.285410,4.756400e+06
7995,information_source,1468601.0,474524.0,0,83928,16841.0,4.191481,0.017470,1.321027,0,1.745113,2.251730e+11,0.098893,0.0,0.323113,1.111711e+06
7996,information_source,93172.0,6418.0,0,705,0.0,3.745192,-0.019069,0.231877,0,0.053767,4.119072e+07,0.044251,0.0,0.068883,4.018173e+05
7997,information_source,82916.0,0.0,0,1718,1007.0,5.837475,-0.061523,0.328112,0,0.107658,0.000000e+00,0.072604,0.0,0.000000,2.527061e+05


In [2556]:
def train_and_predict(df):
    data_clean_ohe = pd.get_dummies(df, columns=['category'], dtype=np.int16)
    
   
    Y = data_clean_ohe["source_attractiveness"]
    X = data_clean_ohe.drop(columns="source_attractiveness")
    
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    
    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  
        ('poly', PolynomialFeatures(interaction_only=True, include_bias=False)),  
        ('ridge', Ridge())  
    ])
    
    param_grid = {
        'ridge__alpha': [ 5,10,20],  
        'poly__degree': [1,2]  
    }
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=7, scoring='neg_mean_squared_error')
    
    grid_search.fit(X_train, Y_train)
    
    print("Лучшие параметры: ", grid_search.best_params_)
    print("Лучший MSE на тренировочных данных: %.6f"%-grid_search.best_score_)
    
    y_pred = grid_search.best_estimator_.predict(X_test)
    print(X_test.shape)
    test_mse = mean_squared_error(Y_test, y_pred)
    print("MSE на тестовых данных: %.6f"% test_mse)
    
    cv_scores = cross_val_score(grid_search.best_estimator_, X_train, Y_train, cv=5, scoring='neg_mean_squared_error')
    print(f"Кросс-валидация MSE: %.6f"% -cv_scores.mean())
    return grid_search

In [2558]:
#model = train_and_predict(data_train)

#### BEST = 0.001121

In [2561]:
def cleaning_data_average_dwelltime_for_mean(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data_mean_ad = data[data["average_dwelltime"]>=0]
    mean_ad = data_mean_ad["average_dwelltime"].mean()
    data.loc[data['average_dwelltime'] < 0, 'average_dwelltime'] = mean_ad
    #print(mean_ad)
    return data


In [2563]:
def complaints_count_cleaning_for_mean(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data["complaints_count"] = pd.to_numeric(data["complaints_count"].replace("-", np.nan))
    data['complaints_count'] = data['complaints_count'].fillna(data['complaints_count'].mean())
    return data
    

In [2565]:
def likes_mean_cleaning(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data['likes'] = data['likes'].fillna(0)
    return data

def clicks_mean_cleaning(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data['clicks'] = data['clicks'].fillna(data["clicks"].mean())
    return data

In [2567]:
def mean_data_cleaning_for_test(data: pd.core.frame.DataFrame)->pd.core.frame.DataFrame:
    data_final = data.copy(deep=True)
    data_final = complaints_count_cleaning_for_mean(data_final)
    data_final = cleaning_data_reg_date(data_final)
    data_final = cleaning_data_average_dwelltime_for_mean(data_final)
    data_final = clicks_mean_cleaning(data_final)
    data_final = likes_mean_cleaning(data_final)
    #df_numeric = data_final.select_dtypes(include=[np.number])
    #data_final[df_numeric.columns] = df_numeric.fillna(df_numeric.mean())
    data_final = add_new_features(data_final)
    data_final= data_final.drop(columns="4xx_errors")
    
    
    #data_final= data_final.drop(columns="5xx_errors")
    
    #data_final = data_final.drop(columns="ID")
    data_final = data_final.drop(columns="errors")
    
    return data_final

In [2570]:
#model = train_and_predict(data_orig)

In [2572]:
data_test = pd.read_csv("test.csv")

In [2574]:
data_orig = mean_data_cleaning_for_test(df)

In [2576]:
data_orig = data_orig.drop(columns="Unnamed: 0")

In [2578]:
model = train_and_predict(data_orig)

Лучшие параметры:  {'poly__degree': 1, 'ridge__alpha': 20}
Лучший MSE на тренировочных данных: 0.007204
(1600, 22)
MSE на тестовых данных: 0.003885
Кросс-валидация MSE: 0.006228


In [2581]:
def predict(data_path):
    data = pd.read_csv(data_path)
    data = mean_data_cleaning_for_test(data)
    data = data.drop(columns="ID")
    
    data = pd.get_dummies(data, columns=['category'], dtype=np.int16)
    predicts = model.predict(data)
    
    return predicts

In [2583]:
data = {
    "source_attractiveness": predict("test.csv")
}
submit = pd.DataFrame(data)
submit.to_csv('submission.csv', index_label="ID")