<a href="https://colab.research.google.com/github/SravaniChowdaryy/Skill-Vertex-Major-Project/blob/main/Skill_Vertex_Major.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation / FutureWarning
# messages that pandas or seaborn may emit in later cells - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Read the training CSV into a DataFrame; the path is kept in one named constant
DATA_PATH = "/content/train.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
#Lift the pandas display limits so that every column and every row is rendered
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
#First look at the data: the top rows
data.head()

In [None]:
#Dimensions of the dataset as (rows, columns)
data.shape

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
#Column names, non-null counts and dtypes of the dataset
data.info()

In [None]:
#Shows the number of unique values in each column
data.nunique()

# CLEANING THE DATA

In [None]:
#Gives the number of null values in each column
data.isnull().sum()

In [None]:
#Visual check for null values over the whole dataset:
#null cells are drawn in a different colour from non-null cells
fig, ax = plt.subplots(figsize=(25,25))
sns.heatmap(data.isnull(), ax=ax)

In [None]:
#Percentage of null values per column: null count divided by the number of rows
row_count = data.shape[0]
null_var = data.isnull().sum() / row_count * 100
null_var

# NUMERIC DATASET - DATA CLEANING AND PREPROCESSING

In [None]:
#Keeps only the columns whose dtype is int64 or float64 (the numeric part of the dataset)
data_num=data.select_dtypes(include=['int64','float64'])
data_num.head()

In [None]:
#Visual check for null values in the numeric dataset:
#null cells are drawn in a different colour from non-null cells
fig, ax = plt.subplots(figsize=(15,9))
sns.heatmap(data_num.isnull(), ax=ax)

In [None]:
#Shows every row of the numeric dataset that has at least one missing value
has_missing = data_num.isnull().any(axis=1)
data_num.loc[has_missing]

In [None]:
#Total number of missing values in each numeric column
missing_total = data_num.isnull().sum()

#The same counts expressed as a percentage of the number of rows
missing_value_num_per = missing_total / data_num.shape[0] * 100

#Put both measures side by side in one summary table
missing_data = pd.concat(
    [missing_total, missing_value_num_per],
    axis=1,
    keys=['Total Missing Values', 'Percent %'],
)

#Display the summary table
missing_data

In [None]:
#List of the numeric column names that contain at least one null value
columns_with_nulls = data_num.isnull().any()
missing_num_var = data_num.columns[columns_with_nulls].tolist()
missing_num_var

In [None]:
#Prints how many numeric columns contain null values
print(len(missing_num_var))

In [None]:
#Data distribution of every numeric column that has missing values
plt.figure(figsize=(15,5))
sns.set()
#One panel per column. The grid width comes from the list itself: a hard-coded
#1x1 grid makes plt.subplot raise ValueError as soon as a second column has nulls
n_panels=len(missing_num_var)
#enumerate yields each column name together with its index
for i,var in enumerate(missing_num_var):
    plt.subplot(1,n_panels,i+1)
    sns.distplot(data_num[var],bins=20,kde_kws={'linewidth':5,'color':'red'})
    

In [None]:
#Replace the missing values of each numeric column with that column's mean
column_means = data_num.mean()
data_num_mean = data_num.fillna(column_means)
#Total number of nulls left after the fill
data_num_mean.isnull().sum().sum()

In [None]:
import seaborn as sns
#Data distribution of the original numeric columns against the mean-filled dataset
plt.figure(figsize=(15,5))
sns.set()
#One panel per column. The grid width comes from the list itself: a hard-coded
#1x1 grid makes plt.subplot raise ValueError as soon as a second column has nulls
n_panels=len(missing_num_var)
#enumerate yields each column name together with its index
for i,var in enumerate(missing_num_var):
    plt.subplot(1,n_panels,i+1)
    sns.distplot(data_num[var],bins=20,kde_kws={'linewidth':8,'color':'red'},label="Original Num Value Dataset")
    sns.distplot(data_num_mean[var],bins=20,kde_kws={'linewidth':3,'color':'green'},label="New Clean Mean Dataset")
    plt.legend()

In [None]:
#Replace the missing values of each numeric column with that column's median
column_medians = data_num.median()
data_num_median = data_num.fillna(column_medians)
#Total number of nulls left after the fill
data_num_median.isnull().sum().sum()

In [None]:
#Density curves of the original numeric columns, the mean-filled dataset and the median-filled dataset
plt.figure(figsize=(25,15))
sns.set()
#One panel per column. The grid width comes from the list itself: a hard-coded
#1x1 grid makes plt.subplot raise ValueError as soon as a second column has nulls
n_panels=len(missing_num_var)
#enumerate yields each column name together with its index
for i,var in enumerate(missing_num_var):
    plt.subplot(1,n_panels,i+1)
    #hist=False draws only the density curve, so the three datasets can be overlaid
    sns.distplot(data_num[var],bins=20,hist=False,kde_kws={'linewidth':8,'color':'red'},label="Original Num Value Dataset")
    sns.distplot(data_num_mean[var],bins=20,hist=False,kde_kws={'linewidth':5,'color':'green'},label="New Clean Mean Dataset")
    sns.distplot(data_num_median[var],bins=20,hist=False,kde_kws={'linewidth':3,'color':'black'},label="New Clean Median Dataset")
    plt.legend()

In [None]:
#Checking for outliers with boxplots: one figure per column, with three stacked
#panels (original, mean-filled, median-filled)
for var in missing_num_var:
    plt.figure(figsize=(10,10))
    for panel,frame in enumerate((data_num,data_num_mean,data_num_median),start=1):
        plt.subplot(3,1,panel)
        #Pass the Series as x= explicitly: seaborn >= 0.12 treats a positional
        #first argument as `data`, which changes the orientation of the box
        sns.boxplot(x=frame[var])

In [None]:
#Original, mean-filled and median-filled versions of the columns with nulls, side by side
versions = [
    data_num[missing_num_var],
    data_num_mean[missing_num_var],
    data_num_median[missing_num_var],
]
data_concat = pd.concat(versions, axis=1, keys=['Original','Mean','Median'])
#Show only the rows where at least one of the columns is null
#(any(axis=1) evaluates the condition across the columns of each row)
rows_with_null = data_concat.isnull().any(axis=1)
data_concat[rows_with_null]

# CATEGORICAL DATASET-DATA CLEANING AND PREPROCESSING

In [None]:
#Keeps only the columns whose dtype is object (the categorical part of the dataset)
data_cat=data.select_dtypes(include=['object'])
data_cat.head()

In [None]:
#Percentage of missing values in each categorical column
#(the mean of the boolean null mask is the fraction of nulls)
missing_value_cat_per=data_cat.isnull().mean()*100
missing_value_cat_per

In [None]:
#Names of the categorical columns that have a non-zero share of missing values
isnull_per = data_cat.isnull().mean() * 100
has_missing = isnull_per > 0
miss_vars = isnull_per[has_missing].index
miss_vars

In [None]:
#Preview of the education column with nulls replaced by the label "missing".
#The result is only displayed, not assigned, so data_cat itself is unchanged
data_cat['education'].fillna("missing")

In [None]:
#Gives the mode (most frequent value) of the education column
data_cat['education'].mode()

In [None]:
#Gives how often each education value occurs
data_cat['education'].value_counts()

In [None]:
#Preview of the education column with nulls replaced by its mode.
#The result is only displayed, not assigned, so data_cat itself is unchanged
data_cat['education'].fillna(data_cat['education'].mode()[0])

In [None]:
#Fills the nulls of every affected categorical column with that column's mode
#and prints the mode that was used
for var in miss_vars:
    mode_value=data_cat[var].mode()[0]
    #Assign the filled column back: fillna(inplace=True) on data_cat[var] is a
    #chained in-place operation on an intermediate Series, which does not update
    #data_cat under pandas Copy-on-Write
    data_cat[var]=data_cat[var].fillna(mode_value)
    print(var,"=",mode_value)

In [None]:
#Gives the count of null values left in each categorical column
data_cat.isnull().sum()

In [None]:
#Compare the imputed categorical columns with the original values
plt.figure(figsize=(16,9))
#One panel per column. The grid width comes from the list itself: a hard-coded
#1x1 grid makes plt.subplot raise ValueError as soon as a second column has nulls
n_panels=len(miss_vars)
for i,var in enumerate(miss_vars):
    plt.subplot(1,n_panels,i+1)
    #Histogram of the column taken from data_cat
    plt.hist(data_cat[var],label='Imput')
    #Histogram of the original column with its null values dropped
    plt.hist(data[var].dropna(),label='Original')
    #Shows the labels
    plt.legend()

In [None]:
#Writes the cleaned values back into the original dataset, in place.
#Categorical columns are taken from data_cat and numeric columns from
#data_num_median, so the median fill (not the mean fill) is the one that is kept
data.update(data_cat)
data.update(data_num_median)

In [None]:
#Checks the update by counting the null values left in each column
data.isnull().sum()

THERE ARE NO NULL VALUES LEFT IN THE DATASET

In [None]:
#Label-encode the five categorical columns; every encoded copy is stored next to
#its source column under the name "<column>_enc"

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for column in ('education','department','region','gender','recruitment_channel'):
    #The shared encoder is re-fitted on each column, so afterwards it only
    #holds the classes of the last column
    data[column+'_enc']=le.fit_transform(data[column])
data.head()

 Do Older Employees get more Promotions than Younger Employees?

In [None]:
#Mean of is_promoted for each age (lineplot aggregates repeated x values)
plt.figure(figsize=(15,5))
#x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
#palette is left out because it has no effect without a hue variable
sns.lineplot(x=data['age'], y=data['is_promoted'])

# What is the Probability of getting Promoted if an Employee has won an Award?

In [None]:
#Mean of is_promoted for each value of awards_won? (lineplot aggregates repeated x values)
plt.figure(figsize=(15,5))
#x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
#palette is left out because it has no effect without a hue variable
sns.lineplot(x=data['awards_won?'], y=data['is_promoted'])

# What is the Average Training Score of the Employees who got Promoted?

In [None]:
#Mean of is_promoted for each average training score (lineplot aggregates repeated x values)
plt.figure(figsize=(15,5))
#x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
#palette is left out because it has no effect without a hue variable
sns.lineplot(x=data['avg_training_score'], y=data['is_promoted'])

# What is the Impact of Gender on Promotions?

In [None]:
#Mean of is_promoted for each gender, drawn as bars
plt.figure(figsize=(15,5))
#x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally
sns.barplot(x=data['gender'], y=data['is_promoted'])

# What is the Probability of Freshers getting Promoted?

In [None]:
#Mean of is_promoted for each length of service (lineplot aggregates repeated x values)
plt.figure(figsize=(15,5))
#x and y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
#palette is left out because it has no effect without a hue variable
sns.lineplot(x=data['length_of_service'], y=data['is_promoted'])

# MULTIVARIATE ANALYSIS

In [None]:
#Correlation heatmap of the numeric columns only: the dataset still holds the
#original string columns, and DataFrame.corr() raises on them in pandas >= 2.0
sns.heatmap(data.select_dtypes(include='number').corr())

# DECISION TREE ALGORITHM

In [None]:
import sklearn
from sklearn.model_selection  import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [None]:
#Feature matrix and target vector
#NOTE(review): 'is_promoted' is used as a feature and 'department' as the target,
#while the analysis above is about promotions - confirm this is the intended setup
feature_columns=['no_of_trainings', 'length_of_service', 'avg_training_score', 'is_promoted']
X=data[feature_columns].values
Y=data['department']
#Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=100 )
#Decision tree using the entropy criterion, limited to depth 3 with at least 5 samples per leaf
clf_entropy=DecisionTreeClassifier(
    criterion="entropy",
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
)
clf_entropy.fit(X_train,y_train)

In [None]:
#Predict the target for the held-out test rows and print the predicted labels
y_pred_en=clf_entropy.predict(X_test)
print(y_pred_en)

In [None]:
#Accuracy on the test split, expressed as a percentage
accuracy_pct = accuracy_score(y_test, y_pred_en) * 100
print("Acurracy is = ", accuracy_pct)