In [16]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Linear-regression baseline: predict Salary from label-encoded demographic
# and job features, then report error metrics on a held-out split.

# Load the dataset and drop incomplete rows BEFORE encoding. Encoding first
# would give a missing category value its own label code (or raise, depending
# on the scikit-learn version), so dropna() could no longer remove those rows.
# .copy() makes the cleaned frame independent so the column assignments below
# do not trigger chained-assignment warnings.
df = pd.read_csv('file.csv').dropna().copy()

# Replace each categorical column with integer codes. The fitted encoders are
# kept so codes can be mapped back with inverse_transform().
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model reads as numeric magnitude -- one-hot encoding is
# usually a better fit for this kind of model.
label_encoders = {}
categorical_features = ['Gender', 'Education Level', 'Job Title']
for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

X = df[['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
y = df['Salary']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

# Coefficients are listed in the same order as the columns of X.
print('Intercept:', model.intercept_)
print('Coefficients:', model.coef_)

Mean Squared Error (MSE): 885414686.25
Root Mean Squared Error (RMSE): 29755.92
Intercept: 98085.48053289721
Coefficients: [-1770.95891227  5544.55220841  2110.22173241   -28.09677576
  8780.58074274]


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Decision-tree regressor: predict Salary from label-encoded demographic and
# job features, then report error metrics on a held-out split.

# Load the dataset and drop incomplete rows BEFORE encoding. Encoding first
# would give a missing category value its own label code (or raise, depending
# on the scikit-learn version), so dropna() could no longer remove those rows.
# .copy() makes the cleaned frame independent so the column assignments below
# do not trigger chained-assignment warnings.
df = pd.read_csv('file.csv').dropna().copy()

# Replace each categorical column with integer codes. The fitted encoders are
# kept so codes can be mapped back with inverse_transform().
label_encoders = {}
categorical_features = ['Gender', 'Education Level', 'Job Title']
for col in categorical_features:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

X = df[['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
y = df['Salary']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state is fixed so repeated runs build the same tree.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

# Note: Decision Tree models don't have coefficients and an intercept in the
# same way linear models do. Per-feature importances are available from
# model.feature_importances_, in the same order as the columns of X.

Mean Squared Error (MSE): 107048440.95
Root Mean Squared Error (RMSE): 10346.42


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Linear regression on polynomial numeric terms plus one-hot encoded
# categories. Preprocessing lives inside the Pipeline, so it is fitted on the
# training split only.

# Read the data, discarding every row that has a missing value
df = pd.read_csv('file.csv').dropna()

# Column groups routed to the two preprocessing branches
numerical_features = ['Age', 'Years of Experience']
categorical_features = ['Gender', 'Education Level', 'Job Title']

# Feature matrix and regression target
X = df[['Age', 'Gender', 'Education Level', 'Job Title', 'Years of Experience']]
y = df['Salary']

# Numeric columns get degree-2 polynomial terms (no bias column); categorical
# columns are one-hot encoded, ignoring categories not seen during fit.
numeric_expander = PolynomialFeatures(degree=2, include_bias=False)
category_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_expander, numerical_features),
    ('cat', category_encoder, categorical_features),
])

# Preprocessing followed by an ordinary least-squares regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows, then predict the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

Mean Squared Error (MSE): 243994925.85
Root Mean Squared Error (RMSE): 15620.34
R-squared (R2): 0.91
