### Importing necessary libraries

: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Importing CSV files

In [None]:
# Load the training data from the CSV in the working directory and preview
# its first five rows.
csv_path = 'train.csv'
df = pd.read_csv(csv_path)
df.head(5)

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape

In [None]:
# List the column labels of the loaded data.
df.columns

In [None]:
# Print per-column dtypes and non-null counts.
df.info()

### Dropping 'sl_no' column

In [None]:
# Remove the 'sl_no' column from the frame.
df = df.drop(columns=['sl_no'])

### Checking for null counts

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Discard fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Re-check the dimensions now that duplicate rows have been dropped.
df.shape

In [None]:
# Mean of the 'salary' column. This runs before missing salaries are filled,
# and pandas skips NaN by default, so rows without a salary are excluded.
df['salary'].mean()

In [None]:
# Median of the 'salary' column (NaN entries are skipped by default).
df['salary'].median()

In [None]:
# Histogram of the raw salary values. Missing salaries are dropped explicitly
# rather than relying on the plotting backend to skip NaN, and the figure is
# titled and labelled like the other plots in this notebook.
plt.hist(x=df['salary'].dropna(), bins=40)
plt.title('Salary Distribution')
plt.xlabel('Salary')
plt.ylabel('Count')
plt.show()

In [None]:
# Frequency of each distinct value in the 'status' column.
df['status'].value_counts()

### Filling NaN values with zero in 'salary' column

In [None]:
# Replace missing salaries with 0; only the 'salary' column is filled.
df = df.fillna(value={'salary': 0})

In [None]:
# Re-count missing values per column after filling 'salary'.
df.isnull().sum()

### Unique Values in 'object' data type columns

In [None]:
# Collect the names of all object-dtype columns, then print each one with its
# distinct values and how many there are.
obj = [name for name in df.columns if df[name].dtype == "object"]
for name in obj:
    distinct = df[name].unique()
    print(name, distinct, len(distinct))

In [None]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

### Gender Distribution

In [None]:
# Histogram of the 'gender' column, with x ticks pinned at 0 and 1.
# presumably 'gender' is already coded 0/1 in train.csv; confirm.
sns.histplot(df, x='gender')
plt.title("Gender Distribution")
plt.xticks(ticks=[0, 1], labels=[0, 1])
plt.show()

### Pair Plot for numeric columns

In [None]:
# Pairwise scatter/distribution grid of the numeric columns, coloured by
# the 'gender' column.
sns.pairplot(df, hue='gender')
plt.show()

### Box plot of ssc_p for Mkt&HR and Mkt&Fin Specializations

In [None]:
# Split the rows by specialisation. These two frames are reused by the other
# box-plot cells in this notebook, so their names must stay as they are.
is_hr = df['specialisation'] == 'Mkt&HR'
is_fin = df['specialisation'] == 'Mkt&Fin'
mkt_hr_data = df[is_hr]
mkt_fin_data = df[is_fin]

# Stack HR rows first, then Finance rows, and compare their ssc_p spread.
hr_and_fin = pd.concat([mkt_hr_data, mkt_fin_data])
sns.boxplot(data=hr_and_fin, x='specialisation', y='ssc_p')
plt.title('Box Plot of ssc_p for Mkt&HR and Mkt&Fin Specializations')
plt.xlabel('Specialization')
plt.ylabel('ssc_p')
plt.show()

### Box plot of hsc_p for Mkt&HR and Mkt&Fin Specializations

In [None]:
# hsc_p by specialisation, using the HR and Finance subsets built in the
# ssc_p box-plot cell (HR rows stacked first).
hr_and_fin = pd.concat([mkt_hr_data, mkt_fin_data])
sns.boxplot(data=hr_and_fin, x='specialisation', y='hsc_p')
plt.title('Box Plot of hsc_p for Mkt&HR and Mkt&Fin Specializations')
plt.xlabel('Specialization')
plt.ylabel('hsc_p')
plt.show()

### Box plot of degree_p for Mkt&HR and Mkt&Fin Specializations

In [None]:
# degree_p by specialisation, using the HR and Finance subsets built in the
# ssc_p box-plot cell (HR rows stacked first).
hr_and_fin = pd.concat([mkt_hr_data, mkt_fin_data])
sns.boxplot(data=hr_and_fin, x='specialisation', y='degree_p')
plt.title('Box Plot of degree_p for Mkt&HR and Mkt&Fin Specializations')
plt.xlabel('Specialization')
plt.ylabel('degree_p')
plt.show()

### Box plot of etest_p for Mkt&HR and Mkt&Fin Specializations

In [None]:
# etest_p by specialisation, using the HR and Finance subsets built in the
# ssc_p box-plot cell (HR rows stacked first).
hr_and_fin = pd.concat([mkt_hr_data, mkt_fin_data])
sns.boxplot(data=hr_and_fin, x='specialisation', y='etest_p')
plt.title('Box Plot of etest_p for Mkt&HR and Mkt&Fin Specializations')
plt.xlabel('Specialization')
plt.ylabel('etest_p')
plt.show()

### Box plot of mba_p for Mkt&HR and Mkt&Fin Specializations

In [None]:
# mba_p by specialisation, using the HR and Finance subsets built in the
# ssc_p box-plot cell (HR rows stacked first).
hr_and_fin = pd.concat([mkt_hr_data, mkt_fin_data])
sns.boxplot(data=hr_and_fin, x='specialisation', y='mba_p')
plt.title('Box Plot of mba_p for Mkt&HR and Mkt&Fin Specializations')
plt.xlabel('Specialization')
plt.ylabel('mba_p')
plt.show()

### Placement Status Distribution with Gender

In [None]:
# Count of each placement status, split by the 'gender' column.
sns.histplot(df, x='status', hue='gender')
plt.title('Placement Status Distribution with Gender')
plt.xlabel('Placement Status')
plt.show()

### Placement Status Distribution with Work Experience

In [None]:
# Count of each placement status, split by the 'workex' column.
sns.histplot(df, x='status', hue='workex')
plt.title('Placement Status Distribution with Work Experience')
plt.xlabel('Placement Status')
plt.show()

### Pie Chart of Specialization

In [None]:
# Share of each specialisation; the counts are computed once and supply both
# the wedge sizes and their labels.
spec_counts = df['specialisation'].value_counts()
plt.pie(spec_counts, labels=spec_counts.index, autopct='%1.1f%%')
plt.title("Pie chart of Specialization")
plt.show()

### Salary Distribution for Placed Candidates

In [None]:
# Keep only rows whose status is the string 'Placed' and plot their salaries
# as a 20-bin histogram with a KDE overlay.
is_placed = df['status'] == 'Placed'
placed_data = df.loc[is_placed]
sns.histplot(placed_data['salary'], bins=20, kde=True)
plt.title('Salary Distribution for Placed Candidates')
plt.xlabel('Salary')
plt.ylabel('Count')
plt.show()

In [None]:
# Names of every column whose dtype is not object (the numeric columns used
# for the correlation heatmap).
cols = [name for name in df.columns if df[name].dtype != "object"]

### Correlation in numeric variables

In [None]:
# Pairwise correlation matrix of the non-object columns, drawn as an
# annotated heatmap.
numeric_frame = df[cols]
corr = numeric_frame.corr()
sns.heatmap(data=corr, annot=True)
plt.title('Correlation in numeric variables')
plt.show()

In [None]:
# For each object-dtype column, print its name, its distinct values and how
# many distinct values it has.
for col in df.columns:
    if df[col].dtype != "object":
        continue
    distinct = df[col].unique()
    print(col, distinct, len(distinct))

### Performing binary (0/1) encoding for categorical variables with 2 unique values

In [None]:
# Flag rows whose 'ssc_b' is 'Central' (1) versus 'Others' (0).
# Any value outside this mapping becomes NaN.
ssc_board_codes = {'Central': 1, 'Others': 0}
df['ssc_b_Central'] = df['ssc_b'].map(ssc_board_codes)

In [None]:
# Flag rows whose 'hsc_b' is 'Central' (1) versus 'Others' (0).
# Any value outside this mapping becomes NaN.
hsc_board_codes = {'Central': 1, 'Others': 0}
df['hsc_b_Central'] = df['hsc_b'].map(hsc_board_codes)

In [None]:
# Recode 'workex' in place: 'No' -> 0, 'Yes' -> 1.
# Any other value becomes NaN, so re-running this cell wipes the column.
workex_codes = {'No': 0, 'Yes': 1}
df['workex'] = df['workex'].map(workex_codes)

In [None]:
# Recode 'status' in place: 'Placed' -> 1, 'Not Placed' -> 0.
# Any other value becomes NaN, so re-running this cell wipes the column.
status_codes = {'Placed': 1, 'Not Placed': 0}
df['status'] = df['status'].map(status_codes)

In [None]:
# Flag rows whose specialisation is 'Mkt&Fin' (1) versus 'Mkt&HR' (0).
# Any value outside this mapping becomes NaN.
specialisation_codes = {'Mkt&HR': 0, 'Mkt&Fin': 1}
df['specialisation_fin'] = df['specialisation'].map(specialisation_codes)

In [None]:
# Preview the frame with the newly added 0/1 columns.
df.head()

In [None]:
# Remove the original text columns now represented by the 0/1 columns
# 'ssc_b_Central', 'hsc_b_Central' and 'specialisation_fin'.
df = df.drop(columns=['ssc_b', 'hsc_b', 'specialisation'])

In [None]:
# Preview the frame after dropping the replaced text columns.
df.head()

### Performing One-Hot Encoding

In [None]:
# One-hot encode 'hsc_s' and 'degree_t' as integer indicator columns.
# drop_first=True omits the first level of each variable.
multi_level = df[['hsc_s', 'degree_t']]
ohe = pd.get_dummies(multi_level, drop_first=True, dtype=int)
ohe

### Concatenating our dataframe

In [None]:
# Build the modelling frame: the indicator columns first, followed by every
# remaining column except the two that were one-hot encoded.
encoded_sources = ['hsc_s', 'degree_t']
remaining = df.drop(encoded_sources, axis=1)
df1 = pd.concat([ohe, remaining], axis=1)
df1.head()

### Correlation of Features with Target Variable

In [None]:
# Correlation matrix over every column of the modelling frame (features and
# 'salary' together), drawn as an annotated heatmap.
corr1 = df1.corr()
sns.heatmap(data=corr1, annot=True)
plt.title('Correlation of Features with Target Variable')
plt.show()

In [None]:
# Absolute correlation of every column with 'salary', strongest first.
# 'salary' itself is included in the ranking.
salary_corr_strength = corr1['salary'].abs()
correlated_variable = salary_corr_strength.sort_values(ascending=False)

In [None]:
# Display the ranked absolute correlations with 'salary'.
correlated_variable

### Independent Variables

In [None]:
# Feature matrix: every column of the modelling frame except the target.
# NOTE(review): 'status' stays in X, and missing salaries were filled with 0
# earlier. Presumably unplaced rows are the ones with salary 0, which would
# make 'status' a near-direct proxy for the target. Confirm this is intended.
X = df1.drop(columns='salary')
X

### Dependent Variable

In [None]:
# Target vector: the 'salary' column (missing values were filled with 0 earlier).
y = df1['salary']

In [None]:
# Display the target series.
y

### Splitting data into Training set and Testing set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train

### Performing Standardization (feature scaling)

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler with default settings; it is fitted on the training features below
# and pickled at the end of the notebook.
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both splits.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Training the model

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares fitted on the scaled training split and
# scored on the scaled test split (fit() returns the estimator itself).
reg = LinearRegression().fit(X_train_scaled, y_train)
reg.score(X_test_scaled, y_test)

### Finding best model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [None]:
# Candidate regressors and the hyper-parameter grid searched for each one.
# Every entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the grid passed to GridSearchCV (an empty dict means the
#              estimator is evaluated once with the settings given here)
#
# Estimators with a stochastic component get a fixed random_state so the
# comparison is repeatable between runs, matching the seeded train/test split
# and cross-validation splitter used elsewhere in this notebook.
algos = {
    'linear_regression': {
        'model': LinearRegression(),
        'params': {},
    },
    'lasso': {
        # random_state only matters when selection='random' is tried.
        'model': Lasso(max_iter=100000, random_state=42),
        'params': {
            'alpha': [1, 2],
            'selection': ['random', 'cyclic'],
        },
    },
    'ridge': {
        'model': Ridge(max_iter=100000),
        'params': {
            'alpha': [1, 2],
        },
    },
    'decision_tree': {
        'model': DecisionTreeRegressor(random_state=42),
        'params': {
            # NOTE: 'poisson' requires a non-negative target.
            'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'splitter': ['best', 'random'],
        },
    },
    'svr': {
        'model': SVR(max_iter=10000000),
        'params': {},
    },
    'random_forest': {
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10, 20, 50],
        },
    },
    'gradient_boosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10, 20, 50],
            'learning_rate': [0.001, 0.01, 0.1, 0.5],
        },
    },
}

In [None]:
# Scale the full feature matrix with the scaler that was fitted on X_train.
# NOTE(review): X still contains the rows held out in X_test, so any model
# selection performed on X_scaled sees the test data.
X_scaled = scaler.transform(X)

In [None]:
# Grid-search each candidate in `algos` and record its best cross-validated
# score and the parameters that achieved it.
#
# The search is fitted on the training split only. Fitting it on the full
# dataset would let the rows held out in X_test influence which model and
# hyper-parameters are chosen, making any later test-set score optimistic.
scores = []
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
for algo_name, mp in algos.items():
    # No `scoring` is passed, so each estimator's own score() method is used.
    # NOTE: `reg` is rebound on every iteration and replaces the
    # LinearRegression fitted earlier under the same name.
    reg = GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    reg.fit(X_train_scaled, y_train)
    scores.append(
        {
            'model': algo_name,
            'best_score': reg.best_score_,
            'best_params': reg.best_params_,
        }
    )

In [None]:
# Tabulate the grid-search results, one row per candidate model.
summary_columns = ['model', 'best_score', 'best_params']
score = pd.DataFrame(data=scores, columns=summary_columns)
score

### Training Ridge model

In [None]:
from sklearn.linear_model import Ridge
# Final model: ridge regression with alpha=1, fitted on the scaled training
# split and scored on the scaled test split (fit() returns the estimator).
ridge = Ridge(alpha=1).fit(X_train_scaled, y_train)
ridge.score(X_test_scaled, y_test)

In [None]:
import pickle

### Saving our model

In [None]:
# Serialise the fitted ridge model to 'model.pkl' in the working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.Pickler(model_file).dump(ridge)

### Saving our scaler

In [None]:
# Serialise the fitted scaler to 'scaler.pkl' so the same scaling can be
# applied to new inputs before they reach the saved model.
with open('scaler.pkl', mode='wb') as scaler_file:
    pickle.Pickler(scaler_file).dump(scaler)