In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the target column in place using scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame before any
# train/test split, so held-out rows influence the scaling — confirm intended.
scaler = StandardScaler()
amount_frame = salary_information_data[['Amount']]
salary_information_data[['Amount']] = scaler.fit_transform(amount_frame)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target ('Amount') from the remaining feature columns.
y = salary_information_data['Amount']
X = salary_information_data.drop(columns='Amount')

# Seeded split; test_size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the correlation matrix over numeric/boolean columns only:
# recent pandas versions raise on non-numeric columns instead of silently
# skipping them.
corr = salary_information_data.select_dtypes(include=['number', 'bool']).corr()

# Plot the correlation matrix
plt.figure(figsize=(20, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.show()

# Select the features whose absolute correlation with the target exceeds 0.5.
# 'Amount' correlates perfectly with itself, so it would always pass the
# threshold; it is the target, not a feature, and the feature frames built
# by dropping 'Amount' no longer contain it, so it is removed from the selection.
# NOTE(review): the correlations are computed on the full dataset, including
# rows that end up in the test split — confirm this is acceptable.
relevant_features = corr.index[corr['Amount'].abs() > 0.5].drop('Amount', errors='ignore')
print(relevant_features)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest (100 trees, seeded) trained only on the columns picked by
# the correlation filter.
selected_train = X_train[relevant_features]
regressor = RandomForestRegressor(random_state=42, n_estimators=100)
regressor.fit(selected_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predict on the held-out rows, restricted to the same selected columns,
# then compute the three regression metrics against the true targets.
y_pred = regressor.predict(X_test[relevant_features])
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the metrics, one per line.
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R-squared:", r2),
):
    print(label, value)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid searched exhaustively (3 values for each of 4
# parameters, i.e. 81 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search. The base estimator is seeded with the
# same random_state used for the other models in this notebook; an unseeded
# forest makes the selected parameters and score change from run to run.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
)

# Fit the search on the training data.
# NOTE(review): this uses every column of X_train, not only
# `relevant_features` — confirm that is the intended feature set.
grid_search.fit(X_train, y_train)

# Best hyperparameters and their mean cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable.
train_df, val_df = train_test_split(
    salary_information_data,
    random_state=42,
    test_size=0.2,
)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the 'Amount' column. The scaler learns its range from the
# training rows only and the same mapping is then applied to both frames.
# Note that this rebinds the name `scaler`.
scaler = MinMaxScaler()
scaler.fit(train_df[["Amount"]])
train_df[["Amount"]] = scaler.transform(train_df[["Amount"]])
val_df[["Amount"]] = scaler.transform(val_df[["Amount"]])


In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow so weight initialisation and batch shuffling are repeatable.
tf.random.set_seed(42)

# Inputs are every column except the target.
# NOTE(review): assumes all remaining columns are numeric — confirm.
train_features = train_df.drop(columns=["Amount"])
val_features = val_df.drop(columns=["Amount"])

# Build a small fully connected regression network using Keras. An explicit
# Input layer replaces the `input_shape=` keyword on the first Dense layer,
# which newer Keras releases warn about.
model = keras.Sequential([
    keras.Input(shape=(train_features.shape[1],)),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1)
])

# Compile the model with mean squared error as the training loss.
model.compile(optimizer="adam", loss="mse")

# Train the network. Without this step the evaluation cell that follows
# scores a model whose weights are still at their random initial values.
# NOTE(review): epochs/batch_size are untuned starting values.
history = model.fit(
    train_features,
    train_df["Amount"],
    validation_data=(val_features, val_df["Amount"]),
    epochs=50,
    batch_size=32,
    verbose=0,
)


In [None]:
# Split the validation frame into inputs and target once and reuse both.
val_features = val_df.drop(columns=["Amount"])
val_target = val_df["Amount"]

# Loss reported by Keras on the validation rows (the model was compiled
# with "mse" as its loss).
loss = model.evaluate(val_features, val_target, verbose=0)
print("Mean squared error on validation data:", loss)

# Predictions for the same validation rows.
y_pred = model.predict(val_features)

# Coefficient of determination of those predictions.
from sklearn.metrics import r2_score
r2 = r2_score(val_target, y_pred)
print("R2 score on validation data:", r2)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.metrics import mean_absolute_error

# Expand the frame with pandas' dummy/indicator encoding.
data_encoded = pd.get_dummies(salary_information_data)

# Target is 'Amount'; everything else is a feature. 10% of rows are held out.
y = data_encoded['Amount']
X = data_encoded.drop(columns='Amount')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)

# Fit a linear regression on the training rows and predict the held-out rows.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Score the predictions with mean absolute error and R^2.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)

In [None]:
# OneHotEncoder is used below; import it here so this cell does not depend
# on an import that only appears in a later cell.
from sklearn.preprocessing import OneHotEncoder

# Integer-encode the categorical columns in place. The single encoder is
# refitted per column, so afterwards `le` only remembers the classes of the
# last column it saw. 'Education' is deliberately left unencoded.
le = LabelEncoder()
for column in ('Designation', 'Company size', 'Work experience'):
    salary_information_data[column] = le.fit_transform(salary_information_data[column])

# One-hot encode columns 0, 1, 2 and 4 and pass columns 3 and 5 through.
# sparse_threshold=0 forces a dense array, because the result is sliced
# positionally into features and target by the following cell.
ct = ColumnTransformer(
    [
        ('encoder', OneHotEncoder(), [0, 1, 2, 4]),  # encode columns 0, 1, 2, 4
        ('passthrough', 'passthrough', [3, 5]),  # leave columns 3, 5 as they are
    ],
    sparse_threshold=0,
)

# Store the encoded array under `data` instead of overwriting
# `salary_information_data`: later cells still call DataFrame methods
# (`.drop`, column lookup) on the frame, and the next cell reads `data`.
# The output columns are the one-hot columns followed by original columns
# 3 and 5, so the last column is original column 5 — presumably 'Amount';
# TODO confirm against the frame's column order.
data = ct.fit_transform(salary_information_data)

In [None]:
# Treat the last column of the encoded array as the target and the rest as
# features, then hold out 20% of the rows with a fixed seed.
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Fit a linear regression on the training rows.
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = lr.predict(X_test)

In [None]:
# Features are every column except 'Amount'; 20% of rows are held out with
# a fixed seed.
y = salary_information_data['Amount']
X = salary_information_data.drop(columns=['Amount'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
# Seeded decision tree regressor fitted on the training rows.
# Note that this rebinds the name `model`.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# For regressors, .score() is the R^2 of the predictions on the given rows.
score = model.score(X_test, y_test)
print(f'R^2 score: {score:.2f}')

In [None]:
# Features are every column except the target.
X = salary_information_data.drop(['Amount'], axis=1)
y = salary_information_data['Amount']

# Hold out 20% of the rows. The original 1% hold-out leaves too few test
# rows for stable metrics; 20% with the same seed reproduces the split used
# for the decision tree, so the two models are scored on identical rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the random forest regression model (100 trees, seeded).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model performance using R-squared and mean squared error
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [None]:
# Report the most recently computed R^2 and MSE, one metric per line.
report = {"R-squared:": r2, "Mean Squared Error:": mse}
for label, value in report.items():
    print(label, value)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# One-hot encode column 3 and pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
data = ct.fit_transform(salary_information_data)

# The last output column is used as the target, the rest as features.
# NOTE(review): presumably the last passthrough column is 'Amount' — confirm.
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a linear regression and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Report R^2 on the held-out rows.
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Token-count features for the 'Designation' column, one row per record.
# NOTE(review): CountVectorizer expects text — confirm 'Designation' still
# holds strings at this point.
vectorizer = CountVectorizer()
designations = salary_information_data['Designation']
job_title_features = vectorizer.fit_transform(designations)

In [None]:
# Mean 'Amount' for every (Designation, Work experience) pair. Pairs that
# never occur stay NaN here so that averages taken over `matrix` skip them.
matrix = salary_information_data.pivot_table(index='Designation', columns='Work experience', values='Amount')

# NMF rejects both NaN and negative entries, and 'Amount' is negative for
# below-average rows if it has been standardised. Factorise a copy that is
# shifted to be non-negative, with unobserved pairs filled with 0.
# NOTE(review): zero-filling treats unobserved pairs as the lowest value;
# swap in a different imputation if that biases the factors.
lowest = matrix.min().min()
shift = -lowest if lowest < 0 else 0
nmf_input = (matrix + shift).fillna(0)

# Apply non-negative matrix factorization to identify latent factors.
# Seeded so W and H are repeatable. Note that this rebinds the name `model`.
model = NMF(n_components=5, max_iter=1000, random_state=42)
W = model.fit_transform(nmf_input)  # one row of factors per Designation
H = model.components_  # one column of loadings per Work experience value

In [None]:
# Linear regression of 'Amount' on three attributes, fitted on every row.
# Note that this rebinds the name `regressor`.
feature_columns = ['Designation', 'Work experience', 'Education']
X = salary_information_data[feature_columns]
y = salary_information_data['Amount']
regressor = LinearRegression()
regressor.fit(X, y)

In [None]:
# Blend weights for the linear, matrix-factorisation and content-based
# salary estimates, in that order.
linear_weight, matrix_weight, content_weight = 0.4, 0.3, 0.3

In [None]:
def recommend_salary(job_title, years_of_experience, education_level, location):
    """Blend three salary estimates for a job into one weighted prediction.

    The result is the weighted sum, using the module-level ``linear_weight``,
    ``matrix_weight`` and ``content_weight``, of:

    1. the fitted ``regressor``'s prediction for the given attributes;
    2. the mean of the ``matrix`` rows belonging to the five designations
       whose NMF factors in ``W`` have the largest dot product with this
       job's factors;
    3. the mean 'Amount' of the five records in ``salary_information_data``
       whose ``job_title_features`` rows are most cosine-similar to the
       first record with this designation.

    Args:
        job_title: Designation value. It must appear in ``matrix.index`` and
            in ``salary_information_data['Designation']``.
        years_of_experience: Value for the 'Work experience' feature.
        education_level: Value for the 'Education' feature.
        location: Accepted for interface compatibility; currently unused.

    Returns:
        The weighted salary prediction.

    Raises:
        KeyError: If ``job_title`` is not in ``matrix.index``.
        IndexError: If no record has ``job_title`` as its designation.
    """
    # Linear-regression estimate. The column names must match the ones the
    # regressor was fitted with ('Work experience', lower-case 'e');
    # scikit-learn rejects frames whose feature names differ.
    # NOTE(review): the values must be encoded the same way as the training
    # columns — confirm against the cell that fits `regressor`.
    new_job = pd.DataFrame({'Designation': job_title, 'Work experience': years_of_experience,
                            'Education': education_level}, index=[0])
    linear_prediction = regressor.predict(new_job)[0]

    # Matrix-factorisation estimate. The query job is excluded by position:
    # with unnormalised dot products it is not guaranteed to rank first, so
    # skipping the top entry could drop the wrong row.
    job_idx = matrix.index.get_loc(job_title)
    job_similarities = np.dot(W[job_idx], W.T)
    ranked_jobs = np.argsort(-job_similarities)
    similar_jobs = ranked_jobs[ranked_jobs != job_idx][:5]
    matrix_prediction = matrix.iloc[similar_jobs].mean().mean()

    # Content-based estimate. `job_title_features` has one row per record of
    # `salary_information_data`, so the lookup must be a row position, not
    # an index label.
    title_matches = np.flatnonzero(
        (salary_information_data['Designation'] == job_title).to_numpy()
    )
    row_pos = title_matches[0]  # IndexError when the title is unknown
    row_similarities = cosine_similarity(job_title_features[row_pos], job_title_features)[0]
    ranked_rows = row_similarities.argsort()[::-1]
    similar_rows = ranked_rows[ranked_rows != row_pos][:5]
    content_prediction = salary_information_data.iloc[similar_rows]['Amount'].mean()

    # Weighted average of the three estimates.
    prediction = (
        linear_weight * linear_prediction
        + matrix_weight * matrix_prediction
        + content_weight * content_prediction
    )
    return prediction