In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the training CSV from the relative `../input` directory into a DataFrame.
training_data = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
# Trailing bare expression: the notebook renders the frame as this cell's output.
training_data

In [3]:
# First rows of the frame (pandas default: 5 rows).
training_data.head()

In [4]:
# Per-column dtype and non-null count summary, printed by pandas.
training_data.info()

In [5]:
# Summary statistics as computed by DataFrame.describe().
training_data.describe()

In [6]:
training_data.corr()

In [7]:
plt.figure(figsize=(20,15))
sns.heatmap(training_data.corr(), annot=True)

In [8]:
training_data.corr().unstack()

In [9]:
feature_corr = training_data.corr().unstack().sort_values()
feature_corr

In [10]:
feature_corr[(feature_corr>0.7)&(feature_corr<1)]

In [11]:
print(feature_corr[(abs(feature_corr)>0.7) & (abs(feature_corr)<1)].drop_duplicates())

In [12]:
training_data = training_data.drop(columns=['id'])

In [13]:
training_data.columns[training_data.dtypes!='object']

In [14]:
# Names of every column whose dtype is something other than `object`.
num_feature = training_data.columns[training_data.dtypes.ne('object')]


def my_plot(feature):
    """Show a histogram of one ``training_data`` column, titled with its name.

    Parameters
    ----------
    feature : label
        Column of the global ``training_data`` frame to plot.
    """
    axes = plt.gca()
    axes.hist(training_data[feature])
    axes.set_title(feature)
    plt.show()


# One histogram per non-object column.
for column_name in num_feature:
    my_plot(column_name)

In [15]:
# Summary statistics for the single column `cont5`.
training_data.loc[:, 'cont5'].describe()

In [16]:
def my_plot(feature):
    """Show a horizontal box plot of one ``training_data`` column.

    Note: this reuses the name of the histogram helper defined in an earlier
    cell, so from here on ``my_plot`` refers to this box-plot version.

    Parameters
    ----------
    feature : label
        Column of the global ``training_data`` frame to plot.
    """
    axes = plt.gca()
    axes.boxplot(training_data[feature], vert=False)
    axes.set_title(feature)
    plt.show()


# One box plot per non-object column.
for column_name in num_feature:
    my_plot(column_name)

In [17]:
# One-hot encode with pandas.get_dummies. drop_first=True omits the first level
# of each encoded column, so k levels become k-1 indicator columns.
training_data_clear = pd.get_dummies(training_data, drop_first=True)

In [18]:
# Preview the encoded frame.
training_data_clear.head()

In [19]:
# Column/dtype summary of the encoded frame.
training_data_clear.info()

In [20]:
x = training_data_clear.drop('target', axis=1)
y = pd.DataFrame(training_data_clear['target'])

In [21]:
x.head()

In [22]:
y.head()

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [24]:
print(x_train.shape)
x_train.head()

In [25]:
print(y_train.shape)
y_train.head()

In [26]:
x_test.shape, y_test.shape

In [27]:
scaler_x = MinMaxScaler().fit(x_train)
scaler_y = MinMaxScaler().fit(y_train)

In [28]:
x_train_sc = scaler_x.transform(x_train)
x_test_sc = scaler_x.transform(x_test)

In [29]:
y_train_sc = scaler_y.transform(y_train)
y_test_sc = scaler_y.transform(y_test)

In [30]:
x_train.columns

In [31]:
y_train

In [32]:
y_train_sc

In [33]:
lr_model = LinearRegression()
lr_model.fit(x_train_sc, y_train_sc)
y_pred_sc = lr_model.predict(x_test_sc)

In [34]:
y_test_sc

In [35]:
y_pred_sc

In [36]:
# Hold-out errors measured on the min-max-scaled target.
mae = mean_absolute_error(y_test_sc, y_pred_sc)
rmse = np.sqrt(mean_squared_error(y_test_sc, y_pred_sc))

# Built-in round() accepts both NumPy scalars and plain Python floats.
# The previous `mae.round(4)` only works when the metric is a NumPy scalar;
# NOTE(review): newer scikit-learn releases appear to return a plain float
# from these metric functions - confirm against the installed version.
print('MAE = ', round(mae, 4))
print('RMSE = ', round(rmse, 4))

In [37]:
y_test_inv = scaler_y.inverse_transform(y_test_sc.reshape(-1,1))
y_pred_inv = scaler_y.inverse_transform(y_pred_sc.reshape(-1,1))

actual_mae = mean_absolute_error(y_test_inv, y_pred_inv)
actual_rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

print('Actual MAE = ', int(actual_mae))
print('Actual RMSE = ', int(actual_rmse))

In [38]:
y_test_inv

In [39]:
y_pred_inv

In [40]:
# Read the test CSV from the relative `../input` directory into a DataFrame.
testing_data = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')
testing_data

In [41]:
# Feature frame without `id`. drop() returns a new frame, so `testing_data`
# itself still has `id`, which the submission cell at the end reads.
testing_data_clear = testing_data.drop(columns=['id'])
testing_data_clear

In [42]:
testing_data_clear = pd.get_dummies(testing_data_clear, drop_first=True)

In [43]:
testing_data_clear.head()

In [44]:
testing_data_clear.insert(loc=30,
          column='cat6_G',
          value=0)

In [45]:
testing_data_clear.columns

In [46]:
testing_data_scaled =scaler_x.transform(testing_data_clear)

In [47]:
testing_data_pred_sc = lr_model.predict(testing_data_scaled)

In [48]:
testing_data_pred_inv = scaler_y.inverse_transform(testing_data_pred_sc.reshape(-1,1))


In [49]:
testing_data_pred_inv

In [50]:
# Two-column submission frame: the test ids and the predicted target.
# ravel() flattens the (n, 1) prediction array into the 1-D shape a column expects.
output = pd.DataFrame({
    'id': testing_data['id'],
    'target': testing_data_pred_inv.ravel(),
})
# Write the file once, without the positional index, so the CSV contains only
# `id,target`. The earlier extra write produced a file with a spurious index
# column that was immediately overwritten.
output.to_csv('submission.csv', index=False)