# In [5]:
# Preprocess Test Data and Get Predictions

# 1. Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 2. Load Dataset
# Read the raw employee records from the current working directory.
data = pd.read_csv("employee.csv")

# 3. Initial Cleanup
# These columns are removed before the feature matrix is built, so the model
# never sees them.
data = data.drop(columns=['id', 'timestamp', 'country'])

# 4. Define Feature Matrix and Target Vector
X = data.drop(columns=['salary'])
y = data['salary']

# 5. Identify Numeric and Categorical Columns
# Both lists are derived from X (target already removed) rather than from
# `data`. This guarantees 'salary' can never appear in either list, and avoids
# the ValueError that `num_cols.remove('salary')` would raise if the target
# were not parsed as a numeric dtype.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# 6. Train-Test Split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Preprocessing Pipeline
# Numeric columns are standardised; object columns are one-hot encoded.
# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not present when it was fitted, instead of raising.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 8. Fit and Transform Training Data
# The preprocessor is fitted on the training split only; the validation split
# is transformed with those already-fitted parameters.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# 9. Train Linear Regression Model
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_processed, y_train)

# 10. Make Predictions on Validation Set
y_pred = model.predict(X_val_processed)

# 11. Evaluate Model
mse = mean_squared_error(y_val, y_pred)
# NOTE(review): MSE is divided by the mean of the validation target, so the
# reported figure is still expressed in target units rather than being
# unitless. Confirm this is the intended definition of "normalized" here.
target_mean = np.mean(y_val)
normalized_mse = mse / target_mean
print("Normalized MSE on Validation Set:", normalized_mse)

# ------------------------------
# Q1: Preprocess Test Data and Get Predictions

# No separate test file is loaded in this cell: a copy of the validation
# features stands in for the test set. Swap in the real test data when it is
# available.
test_data = X_val.copy()

# Apply the already-fitted preprocessor (transform only, no refit) so the test
# features are encoded exactly like the training features.
test_data_processed = preprocessor.transform(test_data)

# Predict with the trained linear model.
predictions = model.predict(test_data_processed)

# Collect the predictions in a single-column frame and preview the first rows.
output = pd.DataFrame(predictions, columns=["Predicted Salary"])
print(output.head())


# Output:
# Normalized MSE on Validation Set: 63206.53705269099
#    Predicted Salary
# 0      75384.615385
# 1     165384.615385
