# Loading the libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Loading the dataset

In [28]:
data = pd.read_csv('EmployeeSurvey.csv')


# Intitial exploration of the data

In [33]:
print("First 5 rows⬇️:\n")
print(data.head())
print("\nLast 5 rows⬇️:\n")
print(data.tail())
print("\nGeneral information about the dataset⬇️:\n")
print(data.info())
print(f"Numbers of rows and columns in the dataset: {data.shape}")
print(f"\nRows with at least one null value: {data.isnull().any(axis=1).sum()}")


First 5 rows⬇️:

   Emp ID  satisfaction_level  last_evaluation  number_project  \
0     1.0                0.38             0.53             2.0   
1     2.0                0.80             0.86             5.0   
2     3.0                0.11             0.88             7.0   
3     4.0                0.72             0.87             5.0   
4     5.0                0.37             0.52             2.0   

   average_montly_hours  time_spend_company  Work_accident  \
0                 157.0                 3.0            0.0   
1                 262.0                 6.0            0.0   
2                 272.0                 4.0            0.0   
3                 223.0                 5.0            0.0   
4                 159.0                 3.0            0.0   

   promotion_last_5years   dept  salary  
0                    0.0  sales     low  
1                    0.0  sales  medium  
2                    0.0  sales  medium  
3                    0.0  sales     low  
4  

# Improving the quality of data
###### Since there are not much null values in comparison to the general size of the dataset - the best solution for the precision of the future predictions will be simply from them.

In [30]:
data = data.dropna()
print(f"\nRows with at least one null value: {data.isnull().any(axis=1).sum()}")


Rows with at least one null value: 0


### I'm interested to explore just the sales department, as it is one of the most numerous in the dataset, and different professions usually have different career  patterns, what will decrease the predictions accuracy significantly.

In [40]:
is_sales_department = data['dept'] == 'sales'
sales_data = data[is_sales_department]
# Getting rid of the "dept" column, since we will explore just sales department
sales_data.drop('dept', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_data.drop('dept', axis=1, inplace=True)


### Get target (dependent) variable into the separate Pandas Dataframe, dropping it in the main sales' dataset which will for the features (independent variables) 

In [41]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_var = 'satisfaction_level'
Y = sales_data[target_var]
X = sales_data.drop([target_var], axis=1)

# Divide data into train & test

In [42]:
from sklearn.model_selection import train_test_split
X['salary'] = le.fit_transform(X['salary'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=101)

#### Ensuring variables were splited correctly

In [43]:
print(f"X_train: {X_train.shape}")
print(f"Y_train: {Y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"Y_test: {Y_test.shape}")

X_train: (3312, 8)
Y_train: (3312,)
X_test: (828, 8)
Y_test: (828,)


# Building & training the basic model

In [45]:

rf_Model = RandomForestClassifier()
rf_Model.fit(X_train,Y_train)

ValueError: could not convert string to float: 'low'