In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Load the preprocessed dataset
data = pd.read_csv('your_preprocessed_dataset.csv')

# Preprocess the 'Age' column
data['Age'] = data['Age'].str.extract(r'(\d+)-?(\d+)?').astype(float).mean(axis=1)

# Convert 'WorkExp' to string to handle non-string values
data['WorkExp'] = data['WorkExp'].astype(str)

# Replace non-numeric values in 'WorkExp' with a large number
data['WorkExp'] = pd.to_numeric(data['WorkExp'].str.replace('More than ', '').str.replace(' years', ''), errors='coerce')
data['WorkExp'] = data['WorkExp'].fillna(100)  # Replace NaNs with a large number

# Check for missing values
if data.isnull().sum().sum() > 0:
    data = data.dropna()

# Select features and target variable
features = ['Age', 'YearsCode', 'WorkExp']
target = 'Salary'

X = data[features].astype(float)  # Ensure features are numeric
y = data[target].astype(float)  # Ensure target variable is numeric

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

# Save the trained model
joblib.dump(model, 'salary_prediction_model.pkl')


Mean Squared Error: 12472339400.89374
Mean Absolute Error: 49982.243992715565
R-squared: 0.05544915154788577


['salary_prediction_model.pkl']

In [18]:
import joblib
import pandas as pd

# Load the trained model
model = joblib.load('salary_prediction_model.pkl')

def predict_salary():
    # Get user input for age, years of coding experience, and work experience
    age = float(input("Enter your age: "))
    years_code = float(input("Enter your years of coding experience: "))
    work_exp = float(input("Enter your work experience: "))
    
    # Create a DataFrame with the input data
    input_data = {'Age': [age], 'YearsCode': [years_code], 'WorkExp': [work_exp]}
    input_df = pd.DataFrame(input_data)
    
    # Make salary prediction
    salary_prediction = model.predict(input_df)
    
    return salary_prediction[0]

# Predict salary based on user input
predicted_salary = predict_salary()
print(f"Predicted salary: ${predicted_salary:.2f}")


Predicted salary: $59887.90
