In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error 

# Load the dataset. The raw frame keeps its own name so the unencoded
# categorical columns are still available after encoding.
raw_df = pd.read_csv("salary_data.csv")

# One-hot encode categorical features. drop_first=True omits one indicator per
# categorical column; that omitted level becomes the implicit baseline
# (all of its sibling indicators are 0).
categorical_cols = ['industry', 'location']
df = pd.get_dummies(raw_df, columns=categorical_cols, drop_first=True)

# Features and target.
# Indicator columns are selected by *prefix* ("industry_", "location_") rather
# than by substring, so an unrelated column that merely contains one of those
# strings somewhere in its name is not pulled into the feature matrix.
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
feature_cols = ['years_experience', 'qualification'] + [
    col for col in df.columns if col.startswith(dummy_prefixes)
]
X = df[feature_cols]
y = df['salary']

# Split the data: 20% held out for testing; random_state is fixed so the split
# is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and evaluation on the held-out rows.
# Note: MSE is expressed in squared units of the target column.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Dataset Preview:")
print(df.head())  # preview of the encoded frame (after get_dummies)
print(f"Mean Squared Error: {mse}")

# Prediction for a new profile
# The profile is written with raw category labels and then encoded, instead of
# hand-typing indicator column names. Hand-typed names can silently drift from
# the model's real features: a key that is not a training column is discarded,
# and a training column that is not a key becomes NaN and breaks predict().
new_profile = pd.DataFrame({
    'years_experience': [5],
    'qualification': [3],
    'industry': ['IT'],
    'location': ['CityB'],
})

# Encode, then align to the exact columns (and order) the model was fitted on.
# Indicators the profile did not produce are filled with 0. A label with no
# matching training column is therefore encoded as all zeros for that feature
# group; this covers the baseline level removed by drop_first.
# NOTE(review): a label never seen in training is encoded the same way, so it
# is indistinguishable from the baseline here.
new_data = (
    pd.get_dummies(new_profile, columns=['industry', 'location'])
    .reindex(columns=X_train.columns, fill_value=0)
    .astype(X_train.dtypes.to_dict())  # match the training column dtypes
)

predicted_salary = model.predict(new_data)
print(f"Predicted salary for the new profile: {predicted_salary[0]:.2f}")

Dataset Preview:
   years_experience  qualification  salary  industry_Marketing  \
0                 1              1   40000               False   
1                 2              2   45000               False   
2                 3              2   42000                True   
3                 4              3   48000               False   
4                 5              3   47000                True   

   location_CityB  location_CityC  
0           False           False  
1            True           False  
2           False           False  
3           False            True  
4            True           False  
Mean Squared Error: 5030572.010113178
Predicted salary for the new profile: 50262.84
