In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Load the preprocessed dataset
dataset = pd.read_csv("preprocessed")

# Create dummy variables for categorical columns
dataset = pd.get_dummies(dataset, drop_first=True)

# Separate features and target variable
indep_X = dataset.drop(columns=['Salary'])
dep_Y = dataset['Salary']

# Split the dataset into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.30, random_state=5)

# Standardize the features
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Define the parameter grid for GridSearchCV
param_grid = {
    'criterion': ['mse', 'mae'],
    'max_features': ['auto', 'sqrt', 'log2'],
    'n_estimators': [10, 100]
}

# Perform GridSearchCV to find the best parameters for RandomForestRegressor
grid = GridSearchCV(RandomForestRegressor(), param_grid, refit=True, verbose=3, n_jobs=1)
grid.fit(x_train, y_train)


# Get the best estimator from GridSearchCV
final_model = grid.best_estimator_



  if _joblib.__version__ >= LooseVersion('0.12'):


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] criterion=mse, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=10, score=0.986, total=   0.3s
[CV] criterion=mse, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=10, score=0.984, total=   0.3s
[CV] criterion=mse, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=10, score=0.985, total=   0.3s
[CV] criterion=mse, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=100, score=0.985, total=   2.3s
[CV] criterion=mse, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=100, score=0.985, total=   2.7s
[CV] criterion=mse, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=auto, n_estimators=100, score=0.987, total=   2.6s
[CV] criterion=mse, max_features=sqrt, n_estimators=10 ...............
[CV]  criterion=mse, max_features=sqrt, n_estimators=10, score=0.961, total=   0.1s
[CV] criterion=mse, max_features=sqrt, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=sqrt, n_estimators=10, score=0.968, total=   0.1s
[CV] criterion=mse, max_features=sqrt, n_estimators=10 ...............
[CV]  criterion=mse, max_features=sqrt, n_estimators=10, score=0.968, total=   0.1s
[CV] criterion=mse, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=sqrt, n_estimators=100, score=0.965, total=   1.0s
[CV] criterion=mse, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=sqrt, n_estimators=100, score=0.975, total=   1.0s
[CV] criterion=mse, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=sqrt, n_estimators=100, score=0.976, total=   1.0s
[CV] criterion=mse, max_features=log2, n_estimators=10 ...............
[CV]  criterion=mse, max_features=log2, n_estimators=10, score=0.950, total=   0.1s
[CV] criterion=mse, max_features=log2, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=log2, n_estimators=10, score=0.962, total=   0.1s
[CV] criterion=mse, max_features=log2, n_estimators=10 ...............
[CV]  criterion=mse, max_features=log2, n_estimators=10, score=0.964, total=   0.1s
[CV] criterion=mse, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=log2, n_estimators=100, score=0.957, total=   1.0s
[CV] criterion=mse, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=log2, n_estimators=100, score=0.970, total=   1.0s
[CV] criterion=mse, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mse, max_features=log2, n_estimators=100, score=0.969, total=   1.0s
[CV] criterion=mae, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=10, score=0.986, total=  12.3s
[CV] criterion=mae, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=10, score=0.982, total=  12.0s
[CV] criterion=mae, max_features=auto, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=10, score=0.984, total=  12.1s
[CV] criterion=mae, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=100, score=0.986, total= 2.0min
[CV] criterion=mae, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=100, score=0.983, total= 2.1min
[CV] criterion=mae, max_features=auto, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=auto, n_estimators=100, score=0.986, total= 2.0min
[CV] criterion=mae, max_features=sqrt, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=10, score=0.948, total=   1.9s
[CV] criterion=mae, max_features=sqrt, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=10, score=0.952, total=   1.9s
[CV] criterion=mae, max_features=sqrt, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=10, score=0.961, total=   1.9s
[CV] criterion=mae, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=100, score=0.958, total=  21.9s
[CV] criterion=mae, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=100, score=0.968, total=  20.7s
[CV] criterion=mae, max_features=sqrt, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=sqrt, n_estimators=100, score=0.965, total=  21.9s
[CV] criterion=mae, max_features=log2, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:  8.1min
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=10, score=0.924, total=   1.9s
[CV] criterion=mae, max_features=log2, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=10, score=0.946, total=   1.9s
[CV] criterion=mae, max_features=log2, n_estimators=10 ...............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=10, score=0.943, total=   1.7s
[CV] criterion=mae, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=100, score=0.946, total=  17.9s
[CV] criterion=mae, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=100, score=0.954, total=  19.8s
[CV] criterion=mae, max_features=log2, n_estimators=100 ..............


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[CV]  criterion=mae, max_features=log2, n_estimators=100, score=0.945, total=  18.1s


In [2]:
re=grid.cv_results_
#print(re)
grid_predictions = grid.predict(x_test)
# print classification report
from sklearn.metrics import r2_score
r_score=r2_score(y_test,grid_predictions)
print("The R_score value for best parameter {}:".format(grid.best_params_),r_score)

The R_score value for best parameter {'criterion': 'mse', 'max_features': 'auto', 'n_estimators': 100}: 0.985343632918796


  if _joblib.__version__ >= LooseVersion('0.12'):


In [3]:
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.275514,0.031485,0.006332,0.0009381339,mse,auto,10,"{'criterion': 'mse', 'max_features': 'auto', '...",0.986179,0.984242,0.985181,0.985201,0.000791,3
1,2.508789,0.172299,0.03386,0.001460041,mse,auto,100,"{'criterion': 'mse', 'max_features': 'auto', '...",0.985204,0.984631,0.98737,0.985735,0.00118,1
2,0.115918,0.007685,0.004877,0.002269447,mse,sqrt,10,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.960786,0.968099,0.968427,0.96577,0.003527,6
3,0.966597,0.013294,0.038161,0.001630544,mse,sqrt,100,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.965466,0.975062,0.975525,0.972017,0.004636,5
4,0.108024,0.005355,0.005701,0.0009158639,mse,log2,10,"{'criterion': 'mse', 'max_features': 'log2', '...",0.950309,0.962421,0.963627,0.958786,0.006014,9
5,0.980019,0.025992,0.04069,0.003445579,mse,log2,100,"{'criterion': 'mse', 'max_features': 'log2', '...",0.957184,0.970061,0.968542,0.965262,0.005745,7
6,12.129077,0.134839,0.003047,0.002975358,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.986106,0.981704,0.984193,0.984001,0.001802,4
7,121.048657,3.403523,0.03228,0.001450414,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.986088,0.983465,0.986341,0.985298,0.001301,2
8,1.913219,0.029597,0.005209,0.007366145,mae,sqrt,10,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.948378,0.952067,0.961438,0.953961,0.005497,10
9,21.455162,0.543345,0.031256,2.928658e-06,mae,sqrt,100,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.958057,0.968401,0.965308,0.963922,0.004335,8


In [8]:
# User input
age_input = float(input("Age:"))
Years_of_Experience_input = float(input("Years of Experience:"))
Gender_input = int(input("Gender (0 for Male, 1 for Female, 2 for Other):"))
Education_Level_input = int(input("Education Level (0 for Bachelor's, 1 for Bachelor's Degree, 2 for High School, 3 for Master's, 4 for Master's Degree, 5 for PhD): "))
Job_Title_level_input = int(input("Job Title Level (0 for Account Manager, 1 for Accountant, etc.):"))
Country_input = int(input("Country (0 for Canada, 1 for China, 2 for UK, 3 for USA):"))

# Get the list of all columns from your original dataset
all_columns = dataset.columns.tolist()

# Initialize an array with zeros to fill other columns
other_columns = np.zeros(len(all_columns))

# Set the user input values in the corresponding positions
other_columns[all_columns.index('Age')] = age_input
other_columns[all_columns.index('Years of Experience')] = Years_of_Experience_input
gender_mapping = {
    0: 'Gender_Male',
    1: 'Gender_Female',
    2: 'Gender_Other'
}
gender_column = gender_mapping.get(Gender_input)
if gender_column is not None:
    other_columns[all_columns.index(gender_column)] = 1
else:
    print(f"Gender mapping for input {Gender_input} not found.")

# Map Education Level input to appropriate column
education_mapping = {
    0: "Education Level_Bachelor's",
    1: "Education Level_Bachelor's Degree",
    2: "Education Level_High School",
    3: "Education Level_Master's",
    4: "Education Level_Master's Degree",
    5: "Education Level_PhD"
}
education_column = education_mapping.get(Education_Level_input)
if education_column is not None:
    other_columns[all_columns.index(education_column)] = 1
else:
    print(f"Education Level mapping for input {Education_Level_input} not found.")
# Map Education Level input to appropriate column
Job_title_mapping = Job_title_mapping = {
    0: 'Job Title_Account Manager',
    1: 'Job Title_Accountant',
    # ... (other job titles)
    'Job Title_Web Developer': 92,  # Assuming 92 is the index of the 'Job Title' column
}
Job_title_column = Job_title_mapping.get(Job_Title_level_input)
if Job_title_column is not None:
    other_columns[all_columns.index(Job_title_column)] = 1
else:
    print(f"Education Level mapping for input {Education_Level_input} not found.")
# Map Education Level input to appropriate column
# Assuming you've defined Country_mapping
Country_mapping = {
    0: 'Canada',
    1: 'China',
    2: 'UK',
    3: 'USA'
}

Country = Country_mapping.get(Country_input)
if Country is not None:
    country_column = f'Country_{Country}'
    other_columns[all_columns.index(country_column)] = 1
else:
    print(f"Country mapping for input {Country_input} not found.")

# ... (Similar mapping and setting for other inputs)

# Reshape the input features to match the model's expectations
input_features_reshaped = sc.transform([other_columns[:-1]])  # Exclude the last column

# Make a prediction using the trained final_model
Future_Prediction = final_model.predict(input_features_reshaped)

print("Future Prediction:", Future_Prediction)

Age:35
Years of Experience:14
Gender (0 for Male, 1 for Female, 2 for Other):0
Education Level (0 for Bachelor's, 1 for Bachelor's Degree, 2 for High School, 3 for Master's, 4 for Master's Degree, 5 for PhD): 5
Job Title Level (0 for Account Manager, 1 for Accountant, etc.):90
Country (0 for Canada, 1 for China, 2 for UK, 3 for USA):3
Education Level mapping for input 5 not found.
Future Prediction: [110507.64104297]


  if _joblib.__version__ >= LooseVersion('0.12'):
