In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Points to be taken into consideration:

#### Questions:
1. Which factor influenced a candidate in getting placed?
2. Does percentage matter for one to get placed?
3. Which degree specialization is most in demand by corporates?

#### Regression Task:
Develop a simple linear regression model by estimating the model parameters. Show the regression plot. Comment on the R-square value of the model and test for significance of regression coefficient based on the summary.

In [2]:
# all the necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.preprocessing as pre
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math

# 1. Importing Data

In [3]:
# Read the placement dataset, using the serial-number column as the row index.
main_df = pd.read_csv(
    '/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv',
    index_col='sl_no',
)
print('Shape : {}'.format(main_df.shape))
main_df.head()

In [4]:
# General overview of the whole frame: column names, non-null counts and dtypes.
main_df.info()

In [5]:
# Exact number of missing values in each column.
main_df.isna().sum()

In [6]:
# Count how many rows fall under each value of the `status` column.
main_df['status'].value_counts()

In [7]:
# Do all the rows with missing values have a status of 'Not Placed'?

# Direction 1: among the 'Not Placed' rows, how many have a missing salary?
status_df = main_df[main_df['status'] == 'Not Placed']
print('Shape of the dataframe:', status_df.shape, sep='\n', end='\n\n')
print('Number of rows with status as Not Placed:', status_df['status'].value_counts(), sep='\n', end='\n\n')
# The count below is of NaN salaries (isnull), not of zero salaries.
print('Number of rows with salary missing (NaN):', status_df['salary'].isnull().sum(), sep='\n', end='\n\n')

# Direction 2 (the question actually asked): take every row that has at least
# one missing value and show which statuses those rows carry.
rows_with_missing = main_df[main_df.isnull().any(axis=1)]
print('Status of the rows that contain a missing value:', rows_with_missing['status'].value_counts(), sep='\n', end='\n\n')

#### Insights:
- We see that all rows with **Status** as '*Not Placed*' have the salary as '*NaN*', which is logical.
- **Recommended change:**
    - remove the rows as they will affect the regression line. ***(should be the optimal choice)***

In [8]:
# Segregate the columns into numeric (non-object dtype) and categorical (object dtype).
num_cols = main_df.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = main_df.select_dtypes(include=['object']).columns.tolist()
print('all the numeric cols: ', num_cols, end='\n\n')
print('all the categorical cols:', cat_cols, end='\n\n')

In [9]:
# Summary statistics for the numeric columns.
main_df.loc[:, num_cols].describe()

In [10]:
# Summary statistics for the categorical columns.
main_df.loc[:, cat_cols].describe()

# 2. Data Visualisation 

In [11]:
# Pairwise scatter plots (and per-column distributions) of the numeric columns.
sns.pairplot(main_df[num_cols])
# The pair grid's figure is the current figure, so the title lands on it.
plt.suptitle('Pairwise relationships between the numeric columns', y=1.02)
# Render explicitly so the cell output is the figure, not the PairGrid repr.
plt.show()

In [12]:
# Distribution of the raw salary column.
fig, ax = plt.subplots()
sns.violinplot(x=main_df['salary'], ax=ax)
ax.set_title('Salary distribution')
# Render explicitly so the cell output is the figure, not the Axes repr.
plt.show()

#### Insights:
- There seems to be some multicollinearity between some of the numerical columns.
- The relation between the dependent variable and independent variables is not very clear due to the skewness in the salary (dependent) column.
- **Violin Plot:**
    - The salary data is positively skewed and has outliers. ***(Transformation Required)***


In [13]:
# Count plot for every categorical column, split by placement status.
# The grid is sized from the number of columns so no empty rows are drawn.
n_grid_cols = 4
n_grid_rows = max(1, math.ceil(len(cat_cols) / n_grid_cols))
plt.figure(figsize=[20, 5 * n_grid_rows])

for position, col in enumerate(cat_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    # `status` itself is drawn without a hue: splitting a column by itself adds
    # nothing. Checking the name (not its list position) keeps this correct if
    # the column order ever changes.
    hue = None if col == 'status' else 'status'
    sns.countplot(data=main_df, x=col, hue=hue)
plt.show()

In [14]:
# Check for correlation between the numeric columns.
# `main_df` still holds object (string) columns at this point; pandas >= 2.0 no
# longer drops them silently in `.corr()` and raises instead, so restrict the
# frame to numeric columns explicitly (works on older pandas as well).
sns.heatmap(main_df.select_dtypes(include='number').corr(), annot=True)
plt.show()

#### Insights:
- ***hsc_p*** and ***ssc_p*** have a high correlation: ***0.51***
- ***degree_p*** and ***ssc_p*** have a high correlation: ***0.54***
- ***hsc_p*** and ***degree_p*** have a high correlation: ***0.43***
- ***mba_p*** and ***ssc_p*** have a high correlation: ***0.39***

> Removing ***hsc_p***, ***degree_p*** and ***mba_p*** and keeping ***ssc_p***, would be a good choice as ***ssc_p explains the other 3 columns as well.***

# 3. Data Transformation

## 3.1 Missing Value Treatment and Log Transformation.

In [15]:
# Drop every row that has a missing value, then drop the three percentage
# columns that were found to be correlated with ssc_p.
transform_df_1 = (
    main_df
    .dropna(axis=0)
    .drop(['hsc_p', 'degree_p', 'mba_p'], axis=1)
)

raw_salary = transform_df_1['salary']
log_salary = np.log(raw_salary)

# Top row: salary as-is. Bottom row: salary after the natural-log transform.
fig, axes = plt.subplots(2, 2, figsize=[10, 10])
sns.histplot(raw_salary, bins=12, ax=axes[0, 0])
sns.violinplot(x=raw_salary, ax=axes[0, 1])
sns.histplot(log_salary, bins=12, ax=axes[1, 0])
sns.violinplot(x=log_salary, ax=axes[1, 1])
plt.show()

# From here on the salary column holds log(salary).
transform_df_1['salary'] = log_salary

## 3.2 Outlier Treatment

In [16]:
# IQR-based outlier detection on the salary column.
# `Series.quantile(..., interpolation='midpoint')` replaces
# `np.percentile(..., interpolation='midpoint')`: NumPy deprecated that keyword
# (renamed to `method` in 1.22), while the pandas spelling is stable and yields
# the same midpoint quartiles.
salary_col = transform_df_1['salary']
Q1 = salary_col.quantile(0.25, interpolation='midpoint')
Q3 = salary_col.quantile(0.75, interpolation='midpoint')
IQR = Q3 - Q1

# Upper bound: index labels of rows at or above Q3 + 1.5 * IQR.
# Boolean indexing on the index avoids the `-1` sentinel, which only worked
# for index labels greater than -1.
upper = transform_df_1.index[salary_col >= (Q3 + 1.5 * IQR)].to_numpy()
print('upper : ', upper)

# Lower bound: index labels of rows at or below Q1 - 1.5 * IQR.
lower = transform_df_1.index[salary_col <= (Q1 - 1.5 * IQR)].to_numpy()
print('lower : ', lower)

In [17]:
# Removing the outliers.
# Both bounds are dropped so the cell stays correct even if a later run of the
# IQR cell does find lower outliers (dropping an empty set is a no-op).
print('Old Shape: ', transform_df_1.shape)
outlier_labels = np.concatenate([upper, lower])
# Non in-place drop with errors='ignore' keeps the cell re-runnable: labels
# already removed by a previous run are skipped instead of raising KeyError.
transform_df_1 = transform_df_1.drop(index=outlier_labels, errors='ignore')
print("New Shape: ", transform_df_1.shape)

In [18]:
# Salary after pre-processing: histogram, violin plot and salary-vs-index scatter.
fig, (ax_hist, ax_violin, ax_scatter) = plt.subplots(1, 3, figsize=[20, 5])
sns.histplot(transform_df_1['salary'], bins=12, ax=ax_hist)
sns.violinplot(x=transform_df_1['salary'], ax=ax_violin)
sns.scatterplot(x=transform_df_1.index, y=transform_df_1['salary'], ax=ax_scatter)
plt.show()

## 3.3 Label Encoding

In [19]:
# Convert the object-type (categorical) columns into integer codes.
# A fresh encoder is fitted per column and kept in `label_encoders`, so the
# category <-> code mapping of every column stays available afterwards
# (`label_encoders[col].classes_`, `.inverse_transform(...)`). Refitting one
# shared encoder would overwrite the mapping of all but the last column.
label_encoders = {}
for col in cat_cols:
    labelencoder = pre.LabelEncoder()
    transform_df_1[col] = labelencoder.fit_transform(transform_df_1[col])
    label_encoders[col] = labelencoder
transform_df_1[cat_cols].head()

## 3.4 Final feature selection

In [20]:
# Correlation heatmap of the fully numeric (label-encoded) frame.
fig, ax = plt.subplots(figsize=[20, 5])
sns.heatmap(transform_df_1.corr(), annot=True, ax=ax)

#### Insights:
- Remove status as it's constant throughout.
- Remove hsc_b as it is collinear with ssc_b and has a lower correlation with salary.
- Remove specialisation, workex, hsc_s as they have a really low contribution to the salary column.

In [21]:
# Drop the columns rejected in the feature-selection step.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
transform_df_1 = transform_df_1.drop(
    columns=['specialisation', 'workex', 'hsc_s', 'hsc_b', 'status'],
    errors='ignore',
)
transform_df_1.shape

In [22]:
# Correlation heatmap of the columns that remain after feature selection.
fig, ax = plt.subplots(figsize=[20, 5])
sns.heatmap(transform_df_1.corr(), annot=True, ax=ax)

# 4. Building a model

In [23]:
# Train/test split: `salary` is the target, every other column is a feature.
y = transform_df_1['salary']
X = transform_df_1.drop(columns='salary')

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.20, random_state=1234
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [24]:
# Change the test values of the dependent variable back to the normal scale.
# The values are re-derived from `y` (which stays on the log scale) using the
# test rows' index labels, so re-running this cell yields the same result
# instead of exponentiating an already exponentiated `y_test`.
y_test = np.exp(y.loc[y_test.index])

## 4.1 Building a simple Linear Regression model

In [25]:
# Fit an ordinary linear regression on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [26]:
# Test the model: predict on the test split and map the predictions through
# exp before computing the metrics against `y_test`.
y_pred = np.exp(lr.predict(X_test))
mse = mean_squared_error(y_test, y_pred)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', math.sqrt(mse))
print('R squared :', r2_score(y_test, y_pred))

## 4.2 Step Wise Regression

In [27]:
def accuracy(col_list, data=None, target='salary', test_size=0.20, random_state=1234):
    """Fit a linear regression on the given feature columns and print test metrics.

    The frame is split into train/test parts, a ``LinearRegression`` is fitted
    on the training part, and both the test targets and the predictions are
    passed through ``np.exp`` before MSE, RMSE and R squared are printed.

    Parameters
    ----------
    col_list : list of str or str
        Feature column(s) to use as predictors.
    data : pandas.DataFrame, optional
        Frame holding the features and the target. Defaults to the notebook's
        global ``transform_df_1``.
    target : str, default 'salary'
        Name of the target column.
    test_size : float, default 0.20
        Fraction of rows held out for testing.
    random_state : int, default 1234
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    ValueError
        If ``col_list`` is empty.
    KeyError
        If a requested feature or the target is not a column of the frame.
    """
    # NOTE(review): the notebook picks features by comparing the R squared this
    # function prints for the *test* split; selecting on the test split makes
    # the final score optimistic. Consider a validation split or cross-validation.
    frame = transform_df_1 if data is None else data

    # Accept a single column name as well as a list of names.
    features = [col_list] if isinstance(col_list, str) else list(col_list)
    if not features:
        raise ValueError('col_list must contain at least one feature column')
    missing = [col for col in features + [target] if col not in frame.columns]
    if missing:
        raise KeyError(f'columns not found in data: {missing}')

    # train test split:
    X = frame[features]
    y = frame[target]
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # changing the test values back to normal for the dependent variable.
    y_test = np.exp(y_test)

    # build the model
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # testing the model
    y_pred = np.exp(lr.predict(X_test))

    # MSE is computed once and reused for the RMSE.
    mse = mean_squared_error(y_test, y_pred)
    print('Mean Square Error:', mse)
    print('Root Mean Square Error:', math.sqrt(mse))
    print('R squared :{:.4f}'.format(r2_score(y_test, y_pred)))

In [28]:
# Baseline for the step-wise comparison: evaluate the model using all of the
# candidate feature columns listed here.
all_cols = ['gender', 'ssc_p', 'ssc_b', 'degree_t', 'etest_p']
accuracy(all_cols)

In [29]:
# Trial without `gender`.
# Outcome: keep gender, as the R squared value decreased without it.
all_cols.remove('gender')
print('columns considered: ', all_cols)
accuracy(all_cols)
# Put the column back (it ends up at the end of the list).
all_cols.append('gender')

In [30]:
# Removing ssc_p.
# Outcome: discard ssc_p, as the R squared value increased without it.
# The membership guard keeps the cell re-runnable: on a second run the column
# is already gone and an unguarded removal would raise ValueError.
if 'ssc_p' in all_cols:
    all_cols.remove('ssc_p')
print('columns considered: ',all_cols)
accuracy(all_cols)

In [31]:
# Trial without `ssc_b`.
# Outcome: keep ssc_b, as the R squared value decreased without it.
all_cols.remove('ssc_b')
print('columns considered: ', all_cols)
accuracy(all_cols)
# Put the column back (it ends up at the end of the list).
all_cols.append('ssc_b')

In [32]:
# Removing degree_t.
# Outcome: discard degree_t, as the R squared value increased without it.
# The membership guard keeps the cell re-runnable: on a second run the column
# is already gone and an unguarded removal would raise ValueError.
if 'degree_t' in all_cols:
    all_cols.remove('degree_t')
print('columns considered: ',all_cols)
accuracy(all_cols)

In [33]:
# Trial without `etest_p`.
# Outcome: keep etest_p, as the R squared value decreased without it.
all_cols.remove('etest_p')
print('columns considered: ', all_cols)
accuracy(all_cols)
# Put the column back (it ends up at the end of the list).
all_cols.append('etest_p')

# 5. Results and Conclusions

In [34]:
# Final model: evaluate with the columns left in `all_cols` after the
# step-wise removals above.
print('columns considered: ',all_cols)
accuracy(all_cols)

#### Insights:
- It was prominent from the scatter plot that there is no specific trend that the dependent variable follows.
- Step wise regression filtered the columns such that the r squared value increased.
- The objective was to understand the data and model on it in the best way possible which was achieved.
