Convert categorical variables (Education Level, Job Level, Department) into numerical values using one-hot encoding or label encoding.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
df = pd.read_csv('NewDataset.csv')
print(df.head())

   experience  test_score(out of 10)  interview_score(out of 10)    salary($)  \
0         7.0                   10.0                         3.0  30165.13401   
1        20.0                    3.0                         5.0  51861.98321   
2        15.0                    3.0                         6.0  41332.07574   
3        11.0                    4.0                         9.0  32120.10096   
4         8.0                    7.0                         5.0  31400.55135   

  education_level previous_company     skills       location  
0          Master        Company H  Python, R    Los Angeles  
1        Bachelor        Company E     SQL, R        Chicago  
2          Master        Company A     R, SQL    Los Angeles  
3          Master        Company B     R, SQL  San Francisco  
4          Master        Company D     SQL, R    Los Angeles  


In [5]:
print(df.columns)

Index(['experience', 'test_score(out of 10)', 'interview_score(out of 10)',
       'salary($)', 'education_level', 'previous_company', 'skills',
       'location'],
      dtype='object')


In [6]:
print(df.dtypes)

experience                    float64
test_score(out of 10)         float64
interview_score(out of 10)    float64
salary($)                     float64
education_level                object
previous_company               object
skills                         object
location                       object
dtype: object


In [7]:
#df.isna().sum()
# Step 2: Check for missing values
missing_values = df.isna().sum()
print(missing_values)

experience                    3
test_score(out of 10)         3
interview_score(out of 10)    1
salary($)                     0
education_level               0
previous_company              0
skills                        0
location                      0
dtype: int64


In [8]:
# Fill missing values with the mean of the respective column
df['test_score(out of 10)'].fillna(df['test_score(out of 10)'].mean(), inplace=True)
df['interview_score(out of 10)'].fillna(df['interview_score(out of 10)'].mean(), inplace=True)

# Verify that there are no more missing values
missing_values_after = df.isna().sum()
print(f"Missing values after filtering:\n{missing_values_after}")

Missing values after filtering:
experience                    3
test_score(out of 10)         0
interview_score(out of 10)    0
salary($)                     0
education_level               0
previous_company              0
skills                        0
location                      0
dtype: int64


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
#creating instance of one hot encoder
Onehotencoder = OneHotEncoder()

class sklearn.preprocessing.OneHotEncoder(n_values='auto', categorical_features='Education Level', 'Job Level', 'Department', dtype=<type 'int'>, sparse=True, handle_unknown='error')

In [10]:
# Initialize LabelEncoders for each categorical column
le_education = LabelEncoder()
le_company = LabelEncoder()
le_location = LabelEncoder()

In [11]:
# Apply LabelEncoder to each column
df['education_level'] = le_education.fit_transform(df['education_level'])   
df['previous_company'] = le_company.fit_transform(df['previous_company'])
df['location'] = le_location.fit_transform(df['location'])                                                  

In [12]:
# Print the transformed DataFrame
print(df.head(5))

   experience  test_score(out of 10)  interview_score(out of 10)    salary($)  \
0         7.0                   10.0                         3.0  30165.13401   
1        20.0                    3.0                         5.0  51861.98321   
2        15.0                    3.0                         6.0  41332.07574   
3        11.0                    4.0                         9.0  32120.10096   
4         8.0                    7.0                         5.0  31400.55135   

   education_level previous_company     skills       location  
0                1        Company H  Python, R    Los Angeles  
1                0        Company E     SQL, R        Chicago  
2                1        Company A     R, SQL    Los Angeles  
3                1        Company B     R, SQL  San Francisco  
4                1        Company D     SQL, R    Los Angeles  


In [103]:
# Apply OneHotEncoder to 'skills' column
df = df.join(df['skills'].str.get_dummies(sep=', '))

# Drop original 'skills' column as it's now one-hot encoded
df.drop(columns=['skills'], inplace=True)

# Normalize numerical features
scaler = StandardScaler()
df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']] = scaler.fit_transform(
    df[['experience', 'test_score(out of 10)', 'interview_score(out of 10)']]
)

print(df.head())

   experience  test_score(out of 10)  interview_score(out of 10)    salary($)  \
0   -0.484584               1.589838                   -0.859014  30165.13401   
1    1.759575              -0.798383                   -0.184680  51861.98321   
2    0.896437              -0.798383                    0.152488  41332.07574   
3    0.205926              -0.457208                    1.163990  32120.10096   
4   -0.311957               0.566315                   -0.184680  31400.55135   

   education_level  previous_company  location  Python  R  SQL  
0                1                 7         1       1  1    0  
1                0                 4         0       0  1    1  
2                1                 0         1       0  1    1  
3                1                 1         3       0  1    1  
4                1                 3         1       0  1    1  


Normalize or scale the numerical features (Years of Experience) if necessary

In [104]:
from sklearn.model_selection import train_test_split

# Define the features and the target variable
X = df.drop(columns=['salary($)'])
y = df['salary($)']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)


(160, 9) (40, 9)


In [106]:
print(df.isnull().sum())


experience                    3
test_score(out of 10)         0
interview_score(out of 10)    0
salary($)                     0
education_level               0
previous_company              0
location                      0
Python                        0
R                             0
SQL                           0
dtype: int64


In [107]:
print(df.describe())


         experience  test_score(out of 10)  interview_score(out of 10)  \
count  1.970000e+02           2.000000e+02                2.000000e+02   
mean  -8.791614e-17          -1.776357e-16               -1.776357e-17   
std    1.002548e+00           1.002509e+00                1.002509e+00   
min   -1.520350e+00          -1.480732e+00               -1.533349e+00   
25%   -8.298395e-01          -7.983828e-01               -8.590142e-01   
50%   -1.393289e-01          -1.160339e-01               -1.846796e-01   
75%    8.964370e-01           9.074893e-01                8.268224e-01   
max    1.759575e+00           1.589838e+00                1.501157e+00   

          salary($)  education_level  previous_company    location  \
count    200.000000        200.00000        200.000000  200.000000   
mean   33501.700093          0.92500          3.520000    1.505000   
std    13794.035747          0.80786          2.283786    1.098183   
min     3729.816420          0.00000          0.00000

In [108]:
print(df.dtypes)


experience                    float64
test_score(out of 10)         float64
interview_score(out of 10)    float64
salary($)                     float64
education_level                 int64
previous_company                int64
location                        int64
Python                          int64
R                               int64
SQL                             int64
dtype: object


In [110]:
import numpy as np
print(df.isnull().sum())
print(df.isin([np.nan, np.inf, -np.inf]).sum())


experience                    3
test_score(out of 10)         0
interview_score(out of 10)    0
salary($)                     0
education_level               0
previous_company              0
location                      0
Python                        0
R                             0
SQL                           0
dtype: int64
experience                    3
test_score(out of 10)         0
interview_score(out of 10)    0
salary($)                     0
education_level               0
previous_company              0
location                      0
Python                        0
R                             0
SQL                           0
dtype: int64


In [111]:
print(df.duplicated().sum())


0


In [113]:
print(df['salary($)'].describe())


count      200.000000
mean     33501.700093
std      13794.035747
min       3729.816420
25%      23004.689272
50%      33130.535795
75%      44189.078817
max      69084.437720
Name: salary($), dtype: float64


In [114]:
from sklearn.linear_model import LinearRegression

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model using the training data
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values