# Aim
* Identifying the target and features.
* Feature selection for each target
* Developing models for future predictions
* Creating additional features if necessary
* Model testing


In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt 

In [2]:
# reading data
df = pd.read_csv('X_train.csv')
df.head(5)

Unnamed: 0,companyId,metadata_0,industry,sector,metadata_1,metadata_2,metadata_3,metadata_4,lastUpdatedAnnumEndDate,lastUpdatedQuarterEndDate,...,Y_0_feature_22,Y_0_feature_69,Y_0_feature_108,Y_0_feature_123,Y_0_feature_80,Y_0_feature_13,Y_0_feature_20,Y_0_feature_35,Y_0_feature_45,Y_0_feature_16
0,company_18887,US,Asset Management,Financial Services,0.003054,0.490076,0.003586,0.000486,2023-12-31,2024-03-31,...,8.45842e-09,0.001202,0.0002487104,6.3e-05,6.5e-05,0.999062,1e-05,0.00134,0.00027,0.015111
1,company_14024,CA,"Furnishings, Fixtures & Appliances",Consumer Cyclical,,0.489839,1.6e-05,4.8e-05,2023-12-31,2024-03-31,...,6.552366e-06,0.001189,5.385571e-05,0.000132,0.000206,0.998454,0.000284,0.001227,0.000258,0.015089
2,company_12659,SE,Electrical Equipment & Parts,Industrials,0.007253,0.486295,1.6e-05,3.4e-05,2023-12-31,2024-03-31,...,6.899561e-07,0.001178,8.217764e-06,9.2e-05,1.2e-05,0.999051,1e-05,0.001212,0.000237,0.015103
3,company_4637,IN,Building Products & Equipment,Industrials,0.000382,,1e-06,1.4e-05,2024-03-31,2024-03-31,...,8.45842e-09,0.001176,8.871565e-07,6.3e-05,7e-06,0.999062,1e-05,0.001212,0.000235,0.01509
4,company_10738,US,Biotechnology,Healthcare,,0.485822,0.000802,2.5e-05,2023-12-31,2024-03-31,...,1.350784e-07,0.001189,8.511356e-05,6.3e-05,6e-06,0.999062,1e-05,0.001432,0.000245,0.015023


In [3]:
print(f"There are {df.shape[0]} samples in the dataset.")

There are 19733 samples in the dataset.


In [4]:
X_test = pd.read_csv('X_forward_looking.csv')
X_test.shape

(24119, 1147)

In [None]:
df.describe().T

In [None]:
# creating datasets
# Select numeric columns by dtype family instead of comparing each dtype to the
# Python builtins `int`/`float`: that comparison only matches the platform's
# default 64-bit types and silently drops float32/int32/nullable numeric columns.
num_data = df.select_dtypes(include='number')
num_attr = list(num_data.columns)

num_data.head(5)

In [None]:
# aligning each target with its respective features.
target_data = pd.read_csv('targets_train.csv')

def create_dataset(features,target):
    """Pair every target column with the feature columns that carry the same id.

    A feature column belongs to a target when its name starts with 'Q' or 'Y'
    and the token after its last underscore equals the token after the last
    underscore of the target name (e.g. 'Q_3_feature_34' <-> 'target_34').

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding the candidate feature columns.
    target : pandas.DataFrame
        Frame with a 'companyId' column plus one column per target.

    Returns
    -------
    dict
        Maps each target column name to a DataFrame made of the matching
        feature columns followed by the target column itself.

    Raises
    ------
    KeyError
        If `target` has no 'companyId' column.
    """
    new_data = {}
    for col in target.drop('companyId',axis=1).columns:
        # Compare the complete id token, not just the last two characters:
        # a two-character comparison also pairs 'target_34' with 'feature_134'.
        target_id = col.rsplit('_',1)[-1]
        matching = [
            column for column in features.columns
            if column.startswith(('Q','Y')) and column.rsplit('_',1)[-1] == target_id
        ]

        # Copy so that adding the target column never writes into `features`.
        col_data = features[matching].copy()
        col_data[col] = target[col]
        new_data[col] = col_data

    return new_data


dataset_dict = create_dataset(num_data,target_data)                

In [None]:
dataset_dict['target_43'].head()

In [None]:
target_7 = dataset_dict['target_7']
target_7.head()

In [None]:
def distplot(rows,columns,data,target):
    """Draw one scatter plot per feature column of `data` against `target`.

    Parameters
    ----------
    rows, columns : int
        Shape of the subplot grid.
    data : pandas.DataFrame
        Frame containing the feature columns and the `target` column.
    target : str
        Name of the column plotted on the y axis of every subplot.

    Returns
    -------
    None
        The figure is shown with `plt.show()`; nothing is returned.

    Notes
    -----
    If `data` has more feature columns than the grid has cells, only the first
    `rows * columns` features are drawn and a warning is emitted.
    """
    import warnings

    # squeeze=False keeps `ax` two-dimensional even for a 1x1 grid, so
    # flatten() below always works.
    fig,ax = plt.subplots(nrows=rows,ncols=columns,sharey=True,figsize=(10,10),squeeze=False)
    fig.suptitle("Distribution of data.")

    feature_data = data.drop([target],axis=1)
    axes = ax.flatten()  # flatten the 2D array of axes into a 1D array
    feature_names = list(feature_data.columns)

    if len(feature_names) > len(axes):
        warnings.warn(
            f"{len(feature_names)} feature columns but only {len(axes)} subplots; "
            "the remaining features are not drawn."
        )

    # Pair features with axes in order, so the first feature lands on the
    # first subplot.
    for axis, col in zip(axes, feature_names):
        axis.scatter(feature_data[col], data[target])
        axis.set_title(col)  # set the title of each subplot to the column name
        axis.grid(True)

    # Hide grid cells that received no feature.
    for axis in axes[len(feature_names):]:
        axis.set_visible(False)

    # Layout has to be adjusted before the figure is rendered.
    fig.tight_layout()
    plt.show()


In [None]:
distplot(4,2,target_7,'target_7')

In [None]:
X_7 = target_7.drop(['target_7'],axis=1)
Y_7 = target_7[['target_7']]

X_7.isna().sum()

In [None]:
corr_7 = target_7.corr()
sb.heatmap(corr_7,annot=True,fmt='.3f')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score,mean_squared_error

xtrain_7,xtest_7,ytrain_7,ytest_7 = train_test_split(X_7,Y_7,test_size=0.25,random_state=12)
print(xtrain_7.shape)
print(xtest_7.shape)
print(ytrain_7.shape)
print(ytest_7.shape)

In [None]:
pls_reg = PLSRegression(n_components=4)
pls_reg.fit(X_7,Y_7)

#y7_train_pred = pls_reg.predict(xtrain_7)
#y7_test_pred = pls_reg.predict(xtest_7)

#print(f"r2 score on train set is {r2_score(ytrain_7,y7_train_pred)} and on test set is {r2_score(ytest_7,y7_test_pred)}")
#print(f"MSE score on train set is {mean_squared_error(ytrain_7,y7_train_pred)} and on test set is {mean_squared_error(ytest_7,y7_test_pred)}")

In [None]:
def test_submission(train_data,model):
    test_features = []
    for col in train_data.columns:
        test_features.append(col)

    test_data = X_test[test_features]

    test_sub = model.predict(test_data)
    return test_sub

test_7 = test_submission(X_7,pls_reg)

## target 43

In [None]:
target_43 = dataset_dict['target_43']
target_43.head()

In [None]:
distplot(4,2,target_43,'target_43')

In [None]:
X_43 = target_43.drop('target_43',axis=1)
Y_43 = target_43['target_43']

xtrain_43,xtest_43,ytrain_43,ytest_43 = train_test_split(X_43,Y_43,test_size=0.25,random_state=13)

In [None]:
pls_reg_43 = PLSRegression(n_components=4)
pls_reg_43.fit(X_43,Y_43)

#y43_train_pred = pls_reg_43.predict(xtrain_43)
#y43_test_pred = pls_reg_43.predict(xtest_43)

#print(f"r2 score on trainset is {r2_score(ytrain_43,y43_train_pred)} and on test set is {r2_score(ytest_43,y43_test_pred)}")
#print(f"MSE score on trainset is {mean_squared_error(ytrain_43,y43_train_pred)} and on test set is {mean_squared_error(ytest_43,y43_test_pred)}")

In [None]:
test_43 = test_submission(X_43,pls_reg_43)

## target 34

In [None]:
target_34 = dataset_dict['target_34']
target_34.head(5)

In [None]:
# Keep only the columns whose name ends in '_34' (the target column included).
subset_col = [name for name in target_34.columns if name.endswith('_34')]

data_34 = target_34.loc[:, subset_col]

In [None]:
distplot(4,2,data_34,'target_34')

In [None]:
X_34 = data_34.drop('target_34',axis=1)
Y_34 = data_34['target_34']

xtrain_34,xtest_34,ytrain_34,ytest_34 = train_test_split(X_34,Y_34,test_size=0.25,random_state=14)

In [None]:
pls_reg_34 = PLSRegression(n_components=4)
pls_reg_34.fit(X_34,Y_34)

#y34_pred_train = pls_reg_34.predict(xtrain_34)
#y34_pred_test = pls_reg_34.predict(xtest_34)

#print(f"r2 score for train set is {r2_score(ytrain_34,y34_pred_train)} and for test set is {r2_score(ytest_34,y34_pred_test)}")
#print(f"MSE score on train set is {mean_squared_error(ytrain_34,y34_pred_train)} and for test set is {mean_squared_error(ytest_34,y34_pred_test)}")

In [None]:
test_34 = test_submission(X_34,pls_reg_34)

## Target 4

In [None]:
target_4 = dataset_dict['target_4']
target_4.head()

In [None]:
distplot(4,2,target_4,'target_4')

In [None]:
plt.figure(figsize=(8,4))
plt.scatter(range(target_4.shape[0]),target_4['Q_3_feature_4'],s=1,c='b')
plt.xlabel('Data values')
plt.ylabel('Q_3 feature values')
plt.grid(True)
plt.show()

In [None]:
# binning the data to categorize and generate clear structure.

#for col in target_4.drop('target_4',axis=1).columns:
#    target_4[col] = pd.cut(target_4[col],8,duplicates='drop',labels=[1,2,3,4,5,6,7,8])

#target_4.head()

In [None]:
#target_4['target_4'] = pd.cut(target_4['target_4'],4,duplicates='drop')

In [None]:
# counting values and mapping the values
#for col in target_4:
#    values = target_4[col].value_counts(normalize=True)
#    target_4[col] = target_4[col].map(values)

#values = target_4.target_4.value_counts(normalize=True)
#target_4['target_4'] = target_4['target_4'].map(values)

In [None]:
distplot(4,2,target_4,'target_4')

In [None]:
# adding polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Split target_4 into its feature matrix and the target column.
X_4 = target_4.drop(['target_4'],axis=1)
Y_4 = target_4['target_4']

# Degree-2 polynomial expansion of the features and its element-wise reciprocal.
# NOTE(review): neither X_poly nor X_poly_rep is referenced by any later cell
# visible in this notebook -- the split below uses the raw X_4. Any zero entry
# in X_poly turns into inf in X_poly_rep. Confirm whether these are still needed.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_4)
X_poly_rep = np.reciprocal(X_poly)
# 75/25 hold-out split of the raw (non-polynomial) features.
xtrain_4,xtest_4,ytrain_4,ytest_4 = train_test_split(X_4,Y_4,test_size=0.25,random_state=23)

In [None]:
pls_reg_4 = PLSRegression(n_components=4)
pls_reg_4.fit(xtrain_4,ytrain_4)

y4_pred_train = pls_reg_4.predict(xtrain_4)
y4_pred_test = pls_reg_4.predict(xtest_4)

X_train_trans = pls_reg_4.transform(xtrain_4)
X_test_trans = pls_reg_4.transform(xtest_4)

print(f"R2 score on train set {r2_score(ytrain_4,y4_pred_train)} and test set {r2_score(ytest_4,y4_pred_test)}")
print(f"MSE score on train set {mean_squared_error(ytrain_4,y4_pred_train)} and test set {mean_squared_error(ytest_4,y4_pred_test)}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

rdclf = RandomForestRegressor(max_depth = 12, n_estimators = 150, min_samples_split = 3,random_state=10)
rdclf.fit(X_4,Y_4)
#y4_pred_train_clf = rdclf.predict(xtrain_4)
#y4_pred_test_clf = rdclf.predict(xtest_4)

#print(f"R2 score on train set {r2_score(ytrain_4,y4_pred_train_clf)} and test set {r2_score(ytest_4,y4_pred_test_clf)}")
#print(f"MSE score on train set {mean_squared_error(ytrain_4,y4_pred_train_clf)} and test set {mean_squared_error(ytest_4,y4_pred_test_clf)}")


In [None]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor fitted on the PLS-transformed features.
# `degree` only affects the polynomial kernel, so it is not passed for 'rbf'.
svm = SVR(kernel='rbf',C=1,tol = 0.02)
svm.fit(X_train_trans,ytrain_4)

y4_pred_svm_train = svm.predict(X_train_trans)
y4_pred_svm_test = svm.predict(X_test_trans)

# Report both splits with explicit labels.
print(f"R2 score on train set is {r2_score(ytrain_4,y4_pred_svm_train)} & test set is {r2_score(ytest_4,y4_pred_svm_test)}")
print(f"MSE score on train set is {mean_squared_error(ytrain_4,y4_pred_svm_train)} and test set is {mean_squared_error(ytest_4,y4_pred_svm_test)}")

In [None]:
test_4 = test_submission(X_4,rdclf)

## target 42

In [None]:
target_42 = dataset_dict['target_42']
target_42.head()

In [None]:
distplot(4,2,target_42,'target_42')

In [None]:
X_42 = target_42.drop('target_42',axis=1)
Y_42 = target_42['target_42']

xtrain_42,xtest_42,ytrain_42,ytest_42 = train_test_split(X_42,Y_42,test_size=0.25,random_state=16)

In [None]:
pls_reg_42 = PLSRegression(n_components=4)
pls_reg_42.fit(X_42,Y_42)

In [None]:
#y42_pred_train = pls_reg_42.predict(xtrain_42)
#y42_pred_test = pls_reg_42.predict(xtest_42)

#print(f"r2 score for train set is {r2_score(ytrain_42,y42_pred_train)} and {r2_score(ytest_42,y42_pred_test)} for test set")
#print(f"MSE score for train set is {mean_squared_error(ytrain_42,y42_pred_train)} and {mean_squared_error(ytest_42,y42_pred_test)} for test set")

In [None]:
test_42 = test_submission(X_42,pls_reg_42)

## Target 9

In [None]:
target_9 = dataset_dict['target_9']
target_9.head()

In [None]:
distplot(4,2,target_9,'target_9')

In [None]:
X_9 = target_9.drop('target_9',axis=1)
Y_9 = target_9['target_9']

xtrain_9,xtest_9,ytrain_9,ytest_9 = train_test_split(X_9,Y_9,test_size=0.25,random_state=17)

In [None]:
pls_reg_9 = PLSRegression(n_components = 4)
pls_reg_9.fit(X_9,Y_9)

#y9_pred_train = pls_reg_9.predict(xtrain_9)
#y9_pred_test = pls_reg_9.predict(xtest_9)

#print(f"r2 score for train set is {r2_score(ytrain_9,y9_pred_train)} and {r2_score(ytest_9,y9_pred_test)} for test set.")
#print(f"MSE score for train set is {mean_squared_error(ytrain_9,y9_pred_train)} and {mean_squared_error(ytest_9,y9_pred_test)} for test set.")

In [None]:
test_9 = test_submission(X_9,pls_reg_9)

## target 27

In [None]:
target_27 = dataset_dict['target_27']
target_27.head()

In [None]:
# Keep only the columns whose name ends in '_27' (the target column included).
col_lst = [name for name in target_27.columns if name.endswith('_27')]

filtered_27 = target_27.loc[:, col_lst]
filtered_27.head()

In [None]:
distplot(4,2,filtered_27,'target_27')

In [None]:
X_27 = filtered_27.drop('target_27',axis=1)
Y_27 = filtered_27['target_27']

xtrain_27,xtest_27,ytrain_27,ytest_27 = train_test_split(X_27,Y_27,test_size=0.25,random_state=16)

In [None]:
pls_reg_27 = PLSRegression(n_components=4)
pls_reg_27.fit(X_27,Y_27)

#y27_pred_train = pls_reg_27.predict(xtrain_27)
#y27_pred_test = pls_reg_27.predict(xtest_27)

#print(f"r2 score on train set {r2_score(ytrain_27,y27_pred_train)} and on test set {r2_score(ytest_27,y27_pred_test)}")
#print(f"MSE score on train set {mean_squared_error(ytrain_27,y27_pred_train)} and test set is {mean_squared_error(ytest_27,y27_pred_test)}.")

In [None]:
test_27 = test_submission(X_27,pls_reg_27)

## target 99

In [None]:
target_99 = dataset_dict['target_99']
target_99.head()

In [None]:
distplot(4,2,target_99,'target_99')

In [None]:
X_99 = target_99.drop(['target_99'],axis=1)
Y_99 = target_99['target_99']

xtrain_99,xtest_99,ytrain_99,ytest_99 = train_test_split(X_99,Y_99,test_size=0.25,random_state=18)

In [None]:
pls_reg_99 = PLSRegression(n_components=4)
pls_reg_99.fit(X_99,Y_99)

#y99_pred_train = pls_reg_99.predict(xtrain_99)
#y99_pred_test = pls_reg_99.predict(xtest_99)

#print(f"r2 score on train set is {r2_score(ytrain_99,y99_pred_train)} and on test set is {r2_score(ytest_99,y99_pred_test)}.")
#print(f"MSE score on train set is {mean_squared_error(ytrain_99,y99_pred_train)} and test set is {mean_squared_error(ytest_99,y99_pred_test)}.")

In [None]:
test_99 = test_submission(X_99,pls_reg_99)

## target 105

In [None]:
target_105 = dataset_dict['target_105']
target_105.head()

In [None]:
distplot(4,2,target_105,'target_105')

In [None]:
X_105 = target_105.drop(['target_105'],axis=1)
Y_105 = target_105['target_105']

xtrain_105,xtest_105,ytrain_105,ytest_105 = train_test_split(X_105,Y_105,test_size=0.25,random_state=18)

In [None]:
pls_reg_105 = PLSRegression(n_components=4)
pls_reg_105.fit(X_105,Y_105)

#y105_pred_train = pls_reg_105.predict(xtrain_105)
#y105_pred_test = pls_reg_105.predict(xtest_105)

#print(f"r2 score on train set is {r2_score(ytrain_105,y105_pred_train)} and on test set is {r2_score(ytest_105,y105_pred_test)}.")
#print(f"MSE score on train set is {mean_squared_error(ytrain_105,y105_pred_train)} and test set is {mean_squared_error(ytest_105,y105_pred_test)}.")

In [None]:
test_105 = test_submission(X_105,pls_reg_105)

## target 24

In [None]:
target_24 = dataset_dict['target_24']
target_24.head()

In [None]:
# Keep only the columns whose name ends in '_24' (the target column included),
# then plot each remaining feature against the target.
col_lst = [name for name in target_24.columns if name.endswith('_24')]

subset_24 = target_24.loc[:, col_lst]
distplot(4,2,subset_24,'target_24')

In [None]:
X_24 = subset_24.drop('target_24',axis=1)
Y_24 = subset_24['target_24']

xtrain_24,xtest_24,ytrain_24,ytest_24 = train_test_split(X_24,Y_24,test_size=0.25,random_state=19)

In [None]:
pls_reg_24 = PLSRegression(n_components = 4)
pls_reg_24.fit(X_24,Y_24)

#y24_pred_train = pls_reg_24.predict(xtrain_24)
#y24_pred_test = pls_reg_24.predict(xtest_24)

#print(f"r2_score for train set is {r2_score(ytrain_24,y24_pred_train)} and test set is {r2_score(ytest_24,y24_pred_test)}.")
#print(f"MSE score for train set is {mean_squared_error(ytrain_24,y24_pred_train)} and test is {mean_squared_error(ytest_24,y24_pred_test)}.")

In [None]:
test_24 = test_submission(X_24,pls_reg_24)

## target 83

In [None]:
target_83 = dataset_dict['target_83']
target_83.head()

In [None]:
distplot(4,2,target_83,'target_83')

In [None]:
X_83 = target_83.drop('target_83',axis=1)
Y_83 = target_83['target_83']

xtrain_83,xtest_83,ytrain_83,ytest_83 = train_test_split(X_83,Y_83,test_size=0.25,random_state=20)

In [None]:
pls_reg_83 = PLSRegression(n_components=4)
pls_reg_83.fit(X_83,Y_83)

#y83_pred_train = pls_reg_83.predict(xtrain_83)
#y83_pred_test = pls_reg_83.predict(xtest_83)

#print(f"r2 score on train set is {r2_score(ytrain_83,y83_pred_train)} and test set is {r2_score(ytest_83,y83_pred_test)}.")
#print(f"MSE score on train set is {mean_squared_error(ytrain_83,y83_pred_train)} and test set is {mean_squared_error(ytest_83,y83_pred_test)}.")

In [None]:
test_83 = test_submission(X_83,pls_reg_83)

## target 80

In [None]:
target_80 = dataset_dict['target_80']
target_80.head()

In [None]:
distplot(4,2,target_80,'target_80')

In [None]:
X_80 = target_80.drop(['target_80'],axis=1)
Y_80 = target_80['target_80']

xtrain_80,xtest_80,ytrain_80,ytest_80 = train_test_split(X_80,Y_80,test_size=0.25,random_state=21)

In [None]:
# Four-component PLS regression for target_80, fitted on every row of X_80
# (not on the xtrain_80 split), since the hold-out evaluation below is disabled.
pls_reg_80 = PLSRegression(n_components=4)
pls_reg_80.fit(X_80,Y_80)

# Disabled hold-out evaluation (R2 and MSE on the train and test splits).
#y80_pred_train = pls_reg_80.predict(xtrain_80)
#y80_pred_test = pls_reg_80.predict(xtest_80)

#print(f"r2 score on train set is {r2_score(ytrain_80,y80_pred_train)} and test set is {r2_score(ytest_80,y80_pred_test)}.")
#print(f"MSE score on train set is {mean_squared_error(ytrain_80,y80_pred_train)} and test set is {mean_squared_error(ytest_80,y80_pred_test)}.")

In [None]:
test_80 = test_submission(X_80,pls_reg_80)

## target 52

In [None]:
target_52 = dataset_dict['target_52']
target_52.head()

In [None]:
distplot(4,2,target_52,'target_52')

In [None]:
X_52 = target_52.drop('target_52',axis=1)
Y_52 = target_52['target_52']

xtrain_52,xtest_52,ytrain_52,ytest_52 = train_test_split(X_52,Y_52,test_size=0.25,random_state=13)

In [None]:
pls_reg_52 = PLSRegression(n_components=4)
pls_reg_52.fit(X_52,Y_52)

#y52_pred_train = pls_reg_52.predict(xtrain_52)
#y52_pred_test = pls_reg_52.predict(xtest_52)

#print(f"r2 score on {r2_score(ytrain_52,y52_pred_train)} and test set is {r2_score(ytest_52,y52_pred_test)}.")
#print(f"MSE score on {mean_squared_error(ytrain_52,y52_pred_train)} and test set is {mean_squared_error(ytest_52,y52_pred_test)}.")

In [None]:
test_52 = test_submission(X_52,pls_reg_52)

## target 69

In [None]:
target_69 = dataset_dict['target_69']
target_69.head()

In [None]:
distplot(4,2,target_69,'target_69')

In [None]:
X_69 = target_69.drop('target_69',axis=1)
Y_69 = target_69['target_69']

xtrain_69,xtest_69,ytrain_69,ytest_69 = train_test_split(X_69,Y_69,test_size=0.25,random_state=15)

In [None]:
pls_reg_69 = PLSRegression(n_components=4)
pls_reg_69.fit(X_69,Y_69)

#y69_pred_train = pls_reg_69.predict(xtrain_69)
#y69_pred_test = pls_reg_69.predict(xtest_69)

#print(f"r2 score on train set {r2_score(ytrain_69,y69_pred_train)} and test set is  {r2_score(ytest_69,y69_pred_test)}.")
#print(f"MSE score on train set {mean_squared_error(ytrain_69,y69_pred_train)} and test set is {mean_squared_error(ytest_69,y69_pred_test)}.")

In [None]:
test_69 = test_submission(X_69,pls_reg_69)

## target 88

In [None]:
target_88 = dataset_dict['target_88']
target_88.head()

In [None]:
distplot(4,2,target_88,'target_88')

In [None]:
X_88 = target_88.drop('target_88',axis=1)
Y_88 = target_88['target_88']

xtrain_88,xtest_88,ytrain_88,ytest_88 = train_test_split(X_88,Y_88,test_size=0.25,random_state=21)

In [None]:
pls_reg_88 = PLSRegression(n_components=4)
pls_reg_88.fit(X_88,Y_88)

#y88_pred_train = pls_reg_88.predict(xtrain_88)
#y88_pred_test = pls_reg_88.predict(xtest_88)

#print(f"r2 score on train set {r2_score(ytrain_88,y88_pred_train)} and test set is {r2_score(ytest_88,y88_pred_test)}")
#print(f"MSE score on train set {mean_squared_error(ytrain_88,y88_pred_train)} and test set {mean_squared_error(ytest_88,y88_pred_test)}")



In [None]:
test_88 = test_submission(X_88,pls_reg_88)

## target 85

In [None]:
target_85 = dataset_dict['target_85']
target_85.head()

In [None]:
distplot(4,2,target_85,'target_85')

In [None]:
X_85 = target_85.drop('target_85',axis=1)
Y_85 = target_85['target_85']

xtrain_85,xtest_85,ytrain_85,ytest_85 = train_test_split(X_85,Y_85,test_size=0.25,random_state=22)

In [None]:
# Four-component PLS regression for target_85, fitted on every row of X_85
# (not on the xtrain_85 split), since the hold-out evaluation below is disabled.
pls_reg_85 = PLSRegression(n_components=4)
pls_reg_85.fit(X_85,Y_85)

# Disabled hold-out evaluation (R2 and MSE on the train and test splits).
#y85_pred_train = pls_reg_85.predict(xtrain_85)
#y85_pred_test = pls_reg_85.predict(xtest_85)

#print(f"r2 score on train set {r2_score(ytrain_85,y85_pred_train)} and test set is {r2_score(ytest_85,y85_pred_test)}")
#print(f"MSE score on train set {mean_squared_error(ytrain_85,y85_pred_train)} and test set is {mean_squared_error(ytest_85,y85_pred_test)}")

In [None]:
test_85 = test_submission(X_85,pls_reg_85)

## target 134

In [None]:
target_134 = dataset_dict['target_134']
target_134.head()

In [None]:
# Keep only the columns whose name ends in '_134' (the target column included),
# then plot each remaining feature against the target.
col_lst = [name for name in target_134.columns if name.endswith('_134')]

subset_134 = target_134.loc[:, col_lst]
distplot(4,2,subset_134,'target_134')

In [None]:
X_134 = subset_134.drop('target_134',axis=1)
Y_134 = subset_134['target_134']

xtrain_134,xtest_134,ytrain_134,ytest_134 = train_test_split(X_134,Y_134,test_size=0.25,random_state=22)

In [None]:
pls_reg_134 = PLSRegression(n_components=4)
pls_reg_134.fit(X_134,Y_134)

#y134_pred_train = pls_reg_134.predict(xtrain_134)
#y134_pred_test = pls_reg_134.predict(xtest_134)

#print(f"r2 score on train set is {r2_score(ytrain_134,y134_pred_train)} and test set is {r2_score(ytest_134,y134_pred_test)}")
#print(f"MSE score on train set is {mean_squared_error(ytrain_134,y134_pred_train)} and test set is {mean_squared_error(ytest_134,y134_pred_test)}.")

In [None]:
test_134 = test_submission(X_134,pls_reg_134)

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.shape

In [None]:
# Forward-looking predictions keyed by the target they were produced for.
# The insertion order is the hand-written column order used for the submission.
predictions = {
    'target_7': test_7, 'target_43': test_43, 'target_34': test_34,
    'target_42': test_42, 'target_9': test_9, 'target_27': test_27,
    'target_99': test_99, 'target_105': test_105, 'target_24': test_24,
    'target_83': test_83, 'target_80': test_80, 'target_52': test_52,
    'target_4': test_4, 'target_69': test_69, 'target_88': test_88,
    'target_85': test_85, 'target_134': test_134,
}

# If the template names its target columns exactly like the keys above
# (presumably it does -- verify against sample_submission.csv), follow the
# template's column order by name so a prediction can never land under the
# wrong header. Otherwise fall back to the hand-written positional order.
template_targets = [name for name in sample_submission.columns if name != 'companyId']
if set(template_targets) == set(predictions):
    ordered_targets = template_targets
else:
    ordered_targets = list(predictions)

test_set = [predictions[name] for name in ordered_targets]

# Flatten each prediction array to 1-D and make sure it has one value per
# template row; a length mismatch would otherwise be padded with NaN by concat.
expected_rows = len(sample_submission)
series_set = []
for name,val in zip(ordered_targets,test_set):
    flat = pd.Series(val.ravel())
    if len(flat) != expected_rows:
        raise ValueError(
            f"{name}: {len(flat)} predictions for {expected_rows} submission rows"
        )
    series_set.append(flat)

submission = pd.concat([sample_submission['companyId'],*series_set],axis=1,ignore_index=True)
submission.head()

In [281]:
submission.columns = sample_submission.columns

In [283]:
submission.to_csv('first_submission.csv',index=False)