First, I import all the necessary packages.

In [1]:
import os
import string

import numpy as np
import pandas as pd
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.multivariate.manova import MANOVA

I load the training dataset and display it.

In [2]:
# Load the training data from the working directory and display it.
df = pd.read_csv('internship_train.csv')
df

FileNotFoundError: [Errno 2] No such file or directory: 'internship_train.csv'

We have a set of 90,000 rows and 54 columns. For more detail, let's look at the per-column information.

In [None]:
# Print the column dtypes and non-null counts.
df.info()

All columns are numeric. They differ only in the type of numeric format - **int** and **float**.

It is known that all data in the set are encoded and anonymous, so it is impossible to interpret them anymore.

To better understand the numerical data in each column, you need to look at their descriptive statistics.

In [None]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

We can't see information about each column, because not all columns are displayed by default. To fix this, we change the display setting and repeat it.

In [None]:
# Lift the column display limit so that describe() shows every column.
pd.set_option('display.max_columns', None)

df.describe()

That is better.

The first six columns (0-5) stand out immediately: they have similar quartiles and means, which may indicate that these data are uniformly distributed. Let's visualize one of these columns.

In [None]:
# Histogram of column 0, a representative of the first six columns (0-5)
# that the surrounding text sets out to visualise.
_ = df.iloc[:, 0].hist()

This histogram is a confirmation of our assumptions. Let's look at other data.

After a detailed review, the columns can be split into groups: the first group is columns 0-5 and 9-12, the second is column 6, the third is column 7, the fourth is column 8, the fifth is columns 13-52, and the sixth is the target. \
Please note that all columns except column 6 have a uniform distribution.

Let's start directly with computational work.

Next we look at how the columns correlate with the target.

In [None]:
# Pearson correlation between each feature column and the target.
# Features are taken by position (the first 53 columns at most). The result
# is a one-row table whose column labels are those positions.
# The row is built in one pass instead of growing a DataFrame with a
# concat per column.
feature_positions = list(range(min(53, df.shape[1])))
table_corr = pd.DataFrame(
    [[np.corrcoef(df.iloc[:, i], df['target'])[0][1] for i in feature_positions]],
    columns=feature_positions,
)

table_corr

In [None]:
# Turn the one-row correlation table into a single column so that
# describe() summarises the coefficients themselves.
corr_column = table_corr.values.T
table_descr = pd.DataFrame(corr_column)

table_descr.describe()

Conclusion: the maximum correlation coefficient between a feature column and the target column is 0.012103. This indicates that there is practically no linear correlation between the feature columns and the target.

Let's try to check the correlation between regular columns.

In [None]:
# Pairwise Pearson correlations between the first 53 columns, computed with
# one vectorised call instead of 53 * 53 separate np.corrcoef invocations.
ex = np.corrcoef(df.iloc[:, :53].to_numpy(dtype=float), rowvar=False)

# Zero the diagonal: each column's correlation with itself is not of
# interest and would otherwise dominate the summary of this table.
np.fill_diagonal(ex, 0)

df_corr = pd.DataFrame(ex, columns=list(range(53)))
df_corr

In [None]:
# Summary statistics of the correlation coefficients, one column at a time.
df_corr.describe()

The descriptive table shows that the correlation coefficient is high for only one pair of columns: 6 and 8. Let's look at their values.

In [None]:
# First 15 values of the column at position 6.
df.iloc[:15, 6]

In [None]:
# First 15 values of the column at position 8.
df.iloc[:15, 8]

Let's see how the values of column 6 are distributed depending on column 8.

In [None]:
# Values of column '6' for the rows where column '8' equals 0 and 1.
group1 = df.loc[df['8'] == 0, '6'].values
group2 = df.loc[df['8'] == 1, '6'].values

# One box per value of column '8', drawn side by side.
fig = go.Figure()
for label, values in (("0", group1), ("1", group2)):
    fig.add_trace(go.Box(y=values, name=label))

fig.update_layout(title_text="Box Plot of dependency")
fig.show()

We can see that all negative values of column 6 occur where column 8 equals 0, and vice versa.

We will run a group analysis to select the columns that will be used for modelling, based on the division into groups above. First we compare the target groups within a single column (ANOVA), and then across all columns of one group (MANOVA). \
We define helper functions for this.

In [None]:
# Column labels (as strings) for each group used in the group analysis.
first_group = [str(i) for i in [*range(6), *range(9, 13)]]
second_group = ['6']
third_group = ['7']
fourth_group = ['8']
fifth_group = list(map(str, range(13, 53)))

In [None]:
# One-way ANOVA of a single column across four bands of the target
def check_scatter(column):
    """Report whether ``column`` differs between four bands of the target.

    The rows of the module-level ``df`` are split by ``df['target']`` into
    the bands ``< 25``, ``[25, 50)``, ``[50, 75]`` and ``> 75``. A one-way
    ANOVA is run on the ``column`` values of those bands, and a message is
    printed when the p-value is below 0.05. Nothing is returned.
    """
    target = df['target']
    series = df[column]

    bands = (
        series[target < 25].values,
        series[(target >= 25) & (target < 50)].values,
        series[(target >= 50) & (target <= 75)].values,
        series[target > 75].values,
    )

    _, p_value = stats.f_oneway(*bands)

    if p_value < 0.05:
        print(f"У колонці {column} виявлено різницю між квартилями.")

# To analyze groups between all columns of one group (MANOVA)
def manova(columns):
    """Run a MANOVA of the given columns against the target and print it.

    Each requested column of the module-level ``df`` is copied under a
    placeholder name (``y0``, ``y1``, ...), because the digit-only column
    labels used in this notebook cannot be used as variable names in a
    formula. The placeholders are generated per column, so groups of any
    size are analysed in full (a fixed alphabet would silently drop every
    column after the 26th).

    Parameters
    ----------
    columns : iterable of column labels of ``df``

    Raises
    ------
    ValueError
        If ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        raise ValueError("manova() needs at least one column")

    names = [f'y{i}' for i in range(len(columns))]

    # Copy raw values so the table does not depend on how ``df`` is indexed.
    df_manova = pd.DataFrame(
        {name: df[column].values for name, column in zip(names, columns)}
    )
    df_manova['target'] = df['target'].values

    formula = f"{' + '.join(names)} ~ target"
    maov = MANOVA.from_formula(formula, data=df_manova)
    print(maov.mv_test())
        
# Box-plot visualisation of one column per target band
def build_box(column):
    """Show box plots of ``column`` for four bands of the target.

    The rows of the module-level ``df`` are split by ``df['target']`` into
    ``< 25``, ``[25, 50)``, ``[50, 75]`` and ``> 75``; one box is drawn per
    band and the figure is displayed. Nothing is returned.
    """
    target = df['target']
    series = df[column]

    banded = [
        ("<25", series[target < 25].values),
        ("25-50", series[(target >= 25) & (target < 50)].values),
        ("50-75", series[(target >= 50) & (target <= 75)].values),
        (">75", series[target > 75].values),
    ]

    fig = go.Figure()
    for label, values in banded:
        fig.add_trace(go.Box(y=values, name=label))

    fig.update_layout(title_text=f"Box Plot column:{column}")
    fig.show()

Let's start with the first group. Visualize the first representative.

We can see that there is no difference between the values of column 0 in the different target groups, as the descriptive statistics above suggested. Instead of visualizing every column, we will run an ANOVA on each column of this group separately.

The logic of the function was based on a print that printed messages when there was a statistically significant difference between the target groups in one column. \
Now let's analyze the difference between the groups of the target parameter of each column within one group.

We see that all groups are the same. \
Conclusion: this group does not affect the target.

Now we will do the same for groups 3, 4 and 5.

There is no difference between the target groups. Let's MANOVA check this group of columns. 

We see that there is no difference. Only group 2 is left now. This group is special because its distribution is not uniform. To see this, visualize the column.

In [None]:
# Histogram of the only column in the second group.
_ = df[second_group[0]].hist()

This distribution is not uniform; it consists of two separate distributions that meet at zero. To merge them into a single distribution, convert the column values to absolute values.

In [None]:
# Replace column '6' with its absolute values, then display it.
df['6'] = df['6'].abs()
df['6']

Now let's visualize this group again.

In [None]:
# Histogram of the second-group column again, to compare with the earlier plot.
_ = df[second_group[0]].hist()

In [None]:
# Box plots of the second-group column for each target band.
build_box(second_group[0])

In [None]:
# Run the ANOVA check on every column of the second group.
for column in second_group:
    check_scatter(column)

We see that now the distribution has one direction and it separates the target groups better than before.

Let's now look at the correlation of this column with the target.

In [None]:
# 2x2 Pearson correlation matrix of column '6' and the target.
corr = np.corrcoef(df['6'], df['target'])
print(corr)

This confirms that this column is statistically significant for predicting the target.

Knowing that the correlation is strong, we can exploit it. Let's try to increase the correlation by squaring this column, and check how the target groups are separated now.

In [None]:
# New feature 'a': the square of column '6'.
df['a'] = df['6'] ** 2

# Correlation matrix of the squared feature and the target.
corr = np.corrcoef(df['a'], df['target'])
print(corr)

In [None]:
# Box plots of the squared feature for each target band.
build_box('a')

The result was better. Let's try the cube.

In [None]:
# New feature 'b': the cube of column '6'.
df['b'] = df['6'] ** 3

# Correlation matrix of the cubed feature and the target.
corr = np.corrcoef(df['b'], df['target'])
print(corr)

In [None]:
# New feature 'c': the fourth power of column '6'.
df['c'] = df['6'] ** 4

# Correlation matrix of the fourth-power feature and the target.
corr = np.corrcoef(df['c'], df['target'])
print(corr)

Conclusion: the transformed column 6 and the new column a should give the best predictions of the target.

Next let's check it out. We create a special tool.

In [None]:
def check_score(df, target=df['target']):
    """Fit a linear regression on a 90/10 split and print R2 and RMSE.

    Parameters
    ----------
    df : feature table passed to ``train_test_split`` (it shadows the
        module-level ``df`` inside this function).
    target : values to predict. The default is the ``'target'`` column of
        the module-level ``df`` as it was when this function was defined;
        default arguments are evaluated once, at definition time.

    The split is reproducible (``random_state=1``). Both metrics are
    computed on the held-out 10% and printed; nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df, target, test_size=0.1, random_state=1
    )
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    print(f'R2: {lr.score(X_test, y_test)}')

    # mean_squared_error returns the MSE; take the square root so that the
    # printed number really is the RMSE its label promises.
    rmse = np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
    print(f'RMSE: {rmse}')

Evaluate R2 and RMSE with one column and two.

In [None]:
# Score a model that uses column '6' only.
check_score(df[['6']])

In [None]:
# Score a model that uses both column '6' and its square 'a'.
check_score(df[['6', 'a']])

In [None]:
# Score a model that uses the squared feature 'a' only.
check_score(df[['a']])

In [None]:
# Correlation matrix of column '6' and its square 'a'.
corr = np.corrcoef(df['6'], df['a'])
print(corr)

Conclusion: Column a is best for forecasting the target.

The next step will be to create an appropriate regression model. To do this, we will take several available models. They can be linear or not.

We separate and standardize data

In [None]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df[['a']], df[['target']], test_size=0.1, random_state=1)

In [None]:
lr = LinearRegression()

# The estimators' `normalize` argument is no longer accepted by current
# scikit-learn, so the "scaled vs. unscaled" comparison is expressed as an
# optional StandardScaler step in front of the model.
normalize = [True, False]
pipe_lr = Pipeline([('scale', StandardScaler()), ('model', lr)])
param_grid = {
    'scale': [StandardScaler() if flag else 'passthrough' for flag in normalize],
}

grid_lr = GridSearchCV(estimator=pipe_lr,
                       param_grid=param_grid,
                       scoring='r2',
                       verbose=0,
                       n_jobs=-1)

grid_result_lr = grid_lr.fit(X_train, y_train)

print('Best Score: ', grid_result_lr.best_score_)
print('Best Params: ', grid_result_lr.best_params_)

In [None]:
y_pred = grid_lr.predict(X_test)

# mean_squared_error returns the MSE; its square root is the RMSE that the
# label announces.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
ridge = Ridge()

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
normalize = [True, False]

# The estimators' `normalize` argument is no longer accepted by current
# scikit-learn, so scaling is searched over as an optional pipeline step;
# the regularisation strength is addressed through the 'model' step.
pipe_rg = Pipeline([('scale', StandardScaler()), ('model', ridge)])
param_grid = {
    'model__alpha': alpha,
    'scale': [StandardScaler() if flag else 'passthrough' for flag in normalize],
}

grid_rg = GridSearchCV(estimator=pipe_rg,
                       param_grid=param_grid,
                       scoring='r2',
                       verbose=0,
                       n_jobs=-1)

grid_result_rg = grid_rg.fit(X_train, y_train)

print('Best Score: ', grid_result_rg.best_score_)
print('Best Params: ', grid_result_rg.best_params_)

In [None]:
y_pred = grid_rg.predict(X_test)

# mean_squared_error returns the MSE; its square root is the RMSE that the
# label announces.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

In [None]:
lasso = Lasso()

alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
normalize = [True, False]

# The estimators' `normalize` argument is no longer accepted by current
# scikit-learn, so scaling is searched over as an optional pipeline step;
# the regularisation strength is addressed through the 'model' step.
pipe_ls = Pipeline([('scale', StandardScaler()), ('model', lasso)])
param_grid = {
    'model__alpha': alpha,
    'scale': [StandardScaler() if flag else 'passthrough' for flag in normalize],
}

grid_ls = GridSearchCV(estimator=pipe_ls,
                       param_grid=param_grid,
                       scoring='r2',
                       verbose=0,
                       n_jobs=-1)

grid_result_ls = grid_ls.fit(X_train, y_train)

print('Best Score: ', grid_result_ls.best_score_)
print('Best Params: ', grid_result_ls.best_params_)

In [None]:
y_pred = grid_ls.predict(X_test)

# mean_squared_error returns the MSE; its square root is the RMSE that the
# label announces.
print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

Of all these models, the best is LinearRegression.

In [None]:
# Load the hidden test set from the working directory and preview it.
df_test = pd.read_csv('internship_hidden_test.csv')

df_test.head()

In [None]:
# Build the same squared feature 'a' on the test set as on the training set.
df_test['a'] = df_test['6'] ** 2

In [None]:
# Final model, refitted on the whole training set.
# The `normalize` argument is no longer accepted by current scikit-learn.
# It is dropped rather than replaced: ordinary least squares with an
# intercept predicts the same values whether or not the feature is rescaled.
lr = LinearRegression()

X_train, y_train = df[['a']], df[['target']]
X_test = df_test[['a']]

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Wrap the predictions in a single-column table named 'Predict'.
answer = pd.DataFrame(y_pred, columns=['Predict'])
answer

In [None]:
# Save the predictions to the working directory. The row index is written as
# the first column (to_csv default) -- NOTE(review): confirm that is wanted.
answer.to_csv('prediction.csv')