In [None]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the raw dataset; read_csv already hands back a DataFrame.
data = pd.read_csv("/kaggle/input/salary-at-30-years-of-age/salary_at_30_years_of_age.csv")

# Work on a copy without the 'Unnamed: 0' column
# (presumably a saved row index -- confirm against the CSV).
df = data.drop(columns=['Unnamed: 0'])

# Pull 'Skills' out of the frame and parse every cell with ast.literal_eval
# (assumes each cell is the string form of a Python list -- TODO confirm).
skills = df.pop('Skills').apply(ast.literal_eval)
print(skills)

# One indicator column per distinct skill label found by the binarizer.
mlb = MultiLabelBinarizer()
skills_matrix = mlb.fit_transform(skills)
skills_encoded = pd.DataFrame(skills_matrix, columns=mlb.classes_)

# Append the indicator columns to the remaining columns.
df = pd.concat([df, skills_encoded], axis=1)

# Separate the predictors from the regression target.
X = df.drop(['Salary_at_30'], axis=1)
y = df['Salary_at_30']

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows' mean/variance leak into the training features and make the
# held-out metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that reads X_scaled still works; every row is now
# scaled with the training-set statistics rather than full-dataset ones.
X_scaled = scaler.transform(X)

# Correlation of every column with the target, strongest positive first.
corr_matrix = df.corr()
corr = corr_matrix['Salary_at_30'].sort_values(ascending=False)
print(corr)

# One small scatter plot per non-target column against the target.
feature_columns = [name for name in df.columns if name != "Salary_at_30"]
for col in feature_columns:
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.scatterplot(x=col, y="Salary_at_30", data=df, ax=ax)
    ax.set_title(f"Salary vs {col}")
    fig.tight_layout()
    plt.show()
    # Release the figure so open figures do not accumulate across iterations.
    plt.close(fig)

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so the call can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Compute every metric up front, then report them in a fixed order.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, i.e. error in the target's own units

print('r2_score: ', round(r2, 2))
for label, value in (('MAE: ', mae), ('MSE: ', mse), ('RMSE: ', rmse)):
    print(label, value)