In [23]:
# Importing numpy for numerical operations
import numpy as np  

# Importing pandas for data manipulation and analysis
import pandas as pd 

# Importing matplotlib for creating visualizations
import matplotlib.pyplot as plt 

# Importing seaborn for creating complex data visualizations, built on top of matplotlib
import seaborn as sns 

# Importing LinearRegression model from sklearn for building a linear regression model
from sklearn.linear_model import LinearRegression 

from sklearn.decomposition import PCA

# Importing metrics from sklearn for evaluating model performance
from sklearn.metrics import mean_squared_error,r2_score 

# Importing train_test_split from sklearn for splitting data into training and testing sets
from sklearn.model_selection import train_test_split  

# Importing StandardScaler from sklearn to standardize features by removing the mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler


In [5]:
# Read the raw player statistics CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='total_data_na.csv')

In [6]:
# Preview the dataset: print a label, then let the notebook render the
# first five rows as the cell's output.
print("First few values: ")
df.head(n=5)

First few values: 


Unnamed: 0,PLAYER,Mat.x,Inns.x,NO,Runs.x,HS,Avg.x,BF,SR.x,X100,...,Ov,Runs.y,Wkts,BBI,Avg.y,Econ,SR.y,X4w,X5w,y
0,Aaron Finch,10,9,1,134,46,16.75,100,134.0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
1,AB de Villiers,12,11,2,480,90,53.33,275,174.54,0,...,0.0,0,0,0,0,0.0,0,0,0,0
2,Abhishek Sharma,3,3,2,63,46,63.0,33,190.9,0,...,0.0,0,0,0,0,0.0,0,0,0,0
3,Ajinkya Rahane,15,14,1,370,65,28.46,313,118.21,0,...,0.0,0,0,0,0,0.0,0,0,0,0
4,Alex Hales,6,6,0,148,45,24.66,118,125.42,0,...,0.0,0,0,0,0,0.0,0,0,0,0


In [7]:
# Summarise the frame's structure: index range, column names, non-null
# counts and dtypes. Worth checking here: any column reported as `object`
# is stored as text and is not yet usable as a numeric feature.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PLAYER  143 non-null    object 
 1   Mat.x   143 non-null    int64  
 2   Inns.x  143 non-null    int64  
 3   NO      143 non-null    int64  
 4   Runs.x  143 non-null    int64  
 5   HS      143 non-null    int64  
 6   Avg.x   143 non-null    object 
 7   BF      143 non-null    int64  
 8   SR.x    143 non-null    float64
 9   X100    143 non-null    int64  
 10  X50     143 non-null    int64  
 11  X4s     143 non-null    int64  
 12  X6s     143 non-null    int64  
 13  Mat.y   143 non-null    int64  
 14  Inns.y  143 non-null    int64  
 15  Ov      143 non-null    float64
 16  Runs.y  143 non-null    int64  
 17  Wkts    143 non-null    int64  
 18  BBI     143 non-null    int64  
 19  Avg.y   143 non-null    object 
 20  Econ    143 non-null    float64
 21  SR.y    143 non-null    object 
 22  X4

In [9]:
# Step 2: separate the prediction target from the candidate features.
# 'Runs.x' is the value to predict; 'PLAYER' is removed with it because it
# holds player names rather than a numeric feature.
# NOTE(review): some remaining columns (e.g. 'Avg.x', 'SR.x') look like they
# may be derived from 'Runs.x' — confirm this is not target leakage.
Y = df['Runs.x']
X = df.drop(labels=['PLAYER', 'Runs.x'], axis='columns')

In [13]:
# Step 3: make every feature column genuinely numeric.
#
# This step targets the '-' placeholder found in some columns (presumably
# marking a missing statistic). Swapping '-' for pd.NA alone would leave the
# other values in those columns as strings, so the columns would stay
# `object` dtype and later steps would depend on implicit string-to-float
# casting. Parsing the columns with pd.to_numeric avoids that.
#
# How it works:
#   - X.apply(pd.to_numeric, errors='coerce') parses each column of X as
#     numbers; errors='coerce' turns any value that cannot be parsed
#     (such as '-') into NaN instead of raising.
#   - .isna().any() is True for each column that ends up with at least one
#     NaN, i.e. a column containing non-numeric (or already missing) values.
#   - X.columns[...] retrieves the names of those columns.
#   - Those columns are then overwritten with their parsed numeric version:
#     placeholders become NaN (rows with NaN are removed by the dropna() in
#     step 4) and every other value becomes a real number.
non_numeric_columns = X.columns[X.apply(pd.to_numeric, errors='coerce').isna().any()]
X[non_numeric_columns] = X[non_numeric_columns].apply(pd.to_numeric, errors='coerce')

In [14]:
# Step 4: recombine features and target, then drop incomplete rows.
# - pd.concat([X, Y], axis=1) places X and Y side by side; axis=1 means the
#   concatenation is done along the columns (horizontally), with rows
#   aligned on the index. Y is a single Series, so it is added to the
#   columns of X as one extra column, 'Runs.x'.
# - dropna() removes every row of the combined frame that contains a missing
#   value (NaN or pd.NA). Dropping on the combined frame keeps features and
#   target in step: a removed row disappears from both.
file_cleaned = pd.concat([X, Y], axis=1).dropna()

In [15]:
# Split the cleaned frame back into features and target.
# - X_cleaned: a new DataFrame with every column of file_cleaned except
#   'Runs.x'. This is the feature set for the model, so the target column
#   is removed from it.
# - Y_cleaned: the 'Runs.x' column on its own, i.e. the target variable the
#   model should predict from the features in X_cleaned.
X_cleaned = file_cleaned.drop(columns=['Runs.x'])
Y_cleaned = file_cleaned['Runs.x']

In [17]:
# Step 5: hold out 20% of the cleaned rows as a test set. The fixed
# random_state makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_cleaned,
    Y_cleaned,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Standardization: rescale each feature to zero mean and unit variance.
# - scaler.fit_transform(X_train) performs two operations:
#     fit:       computes the mean and standard deviation of each feature
#                in the training data X_train.
#     transform: scales the training data using those statistics.
# - scaler.transform(X_test) reuses the statistics computed from the
#   training data, so nothing is learned from the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Apply PCA. A fractional n_components asks PCA to keep as many components
# as are needed to explain that share of the variance (here 95%).
# The projection is fitted on the training data only and then reused for
# the test data.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X=X_train_scaled)
X_test_pca = pca.transform(X=X_test_scaled)

In [20]:
# Train a linear regression on the PCA-reduced training features.
model = LinearRegression()
model.fit(X=X_train_pca, y=Y_train)

In [28]:
# Predict on the held-out test set, then score the predictions with
# R^2 and mean squared error.
y_pred = model.predict(X_test_pca)
r2 = r2_score(y_true=Y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=Y_test, y_pred=y_pred)

In [30]:
# Report how many principal components were kept, followed by the test-set
# error and goodness of fit.
for label, value in (
    ("number of Components: ", pca.n_components_),
    ("MSE: ", mse),
    ("r2_score: ", r2),
):
    print(label, value)

number of Components:  9
MSE:  316.698348365652
r2_score:  0.9917986838512697
