In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from google.colab import drive
from scipy import stats
from scipy.stats.stats import pearsonr
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Print wide DataFrames on one line instead of wrapping their columns.
pd.set_option('display.expand_frame_repr', False)

In [None]:
# Mount Google Drive under /content/drive (Colab only); the dataset path below lives there.
drive.mount('/content/drive')

In [None]:
# Location of the dataset on the mounted Drive (absolute, Colab-specific path).
path = "/content/drive/My Drive/data.csv"

In [None]:
# Load the CSV at `path` into a DataFrame.
df = pd.read_csv(path)

In [None]:
# Report the dataset dimensions.
n_rows, n_cols = df.shape
print("Total number of rows in dataset = {}".format(n_rows))
print("Total number of columns in dataset = {}".format(n_cols))

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Separate the predictors (X) from the response column (y)
target_col = "Y"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Dimensions of the feature matrix: (rows, columns).
X.shape

In [None]:
# Length of the target column; should match the row count of X.
y.shape

In [None]:
# Hold out 30% of the rows for testing (70% train); the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Keep the two features that score highest under the univariate f_regression test
k_best_selector = SelectKBest(score_func=f_regression, k=2)
X_new = k_best_selector.fit_transform(X_train, y_train)

In [None]:
# First five rows of the reduced feature matrix
X_new[:5]

# Relationship of Features with the Response Variable

In [None]:
def plot_join_plot(df, feature, target):
    """Show a regression joint plot of ``feature`` vs ``target`` with Pearson's r.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both columns.
    feature : str
        Label of the column drawn on the x-axis.
    target : str
        Label of the column drawn on the y-axis.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.
    """
    # x / y are passed by keyword: seaborn 0.12+ accepts only `data`
    # positionally, so the previous positional form raises a TypeError there.
    # The keyword form also works on older releases.
    grid = sns.jointplot(x=feature, y=target, data=df, kind="reg")

    # JointGrid.annotate() is no longer available in current seaborn, so the
    # statistic is computed here and written onto the joint axes. Rows with a
    # missing value in either column are skipped so they cannot turn the
    # coefficient into NaN.
    x_values, y_values = df[feature], df[target]
    complete = x_values.notna() & y_values.notna()
    r_value, p_value = stats.pearsonr(x_values[complete], y_values[complete])
    grid.ax_joint.annotate(
        "pearsonr = {:.2g}; p = {:.2g}".format(r_value, p_value),
        xy=(0.05, 0.95),
        xycoords="axes fraction",
        ha="left",
        va="top",
    )

    plt.show()

In [None]:
# Re-attach the target to the training features, column-wise, for plotting
train_df = pd.concat(objs=[X_train, y_train], axis="columns")

In [None]:
# Joint regression plot: X1 against the target
plot_join_plot(df=train_df, feature="X1", target=target_col)

In [None]:
# Joint regression plot: X2 against the target
plot_join_plot(df=train_df, feature="X2", target=target_col)

In [None]:
# Joint regression plot: X3 against the target
plot_join_plot(df=train_df, feature="X3", target=target_col)

In [None]:
# Joint regression plot: X4 against the target
plot_join_plot(df=train_df, feature="X4", target=target_col)

# Pearson Correlation Analysis

In [None]:
# Pearson correlation (and its p-value) between X4 and the target on the training split
stats.pearsonr(X_train["X4"], y_train)

In [None]:
# One [feature, correlation, p-value] record per training column
out_list = [
    [feature_name, *stats.pearsonr(X_train[feature_name], y_train)]
    for feature_name in X_train.columns
]

In [None]:
# Tabulate the per-feature correlation results
corr_df = pd.DataFrame(
    data=out_list,
    columns=["Features", "Correlation", "P-Value"],
)

In [None]:
# Preview the correlation table in its original (column) order.
corr_df.head()

In [None]:
# Order the features by ascending p-value (smallest first)
corr_df = corr_df.sort_values(by="P-Value")

In [None]:
# Preview again: rows are now ordered by ascending p-value.
corr_df.head()