In [None]:
# Mount Google Drive at /content/drive; the dataset is later read from a
# path underneath this mount point.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
# Load the salary dataset from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Salary Data.csv',
)


In [None]:
# Preview the first five rows (5 is also head()'s default).
print(df.head(n=5))

In [None]:
# Summary statistics produced by describe() with its default arguments.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() added a stray "None" line.
df.info()

In [None]:
# Count the missing values in every column (isna is the same check as isnull).
print(df.isna().sum())

In [None]:
# Impute missing values with the column mean for numerical columns.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place
# form raises a FutureWarning on pandas 2.x and does not modify `df` at all
# once Copy-on-Write is in effect (pandas 3).
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Years of Experience'] = df['Years of Experience'].fillna(df['Years of Experience'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())


In [None]:
# Impute missing values with the mode for categorical columns.
# mode() can return several values; [0] takes the first one.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a column selection: that chained in-place
# form raises a FutureWarning on pandas 2.x and does not modify `df` at all
# once Copy-on-Write is in effect (pandas 3).
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Education Level'] = df['Education Level'].fillna(df['Education Level'].mode()[0])
df['Job Title'] = df['Job Title'].fillna(df['Job Title'].mode()[0])

In [None]:
# Re-count missing values per column to check the result of the imputation.
print(df.isna().sum())

In [None]:
# Bar plot for average salary by years of experience
fig, ax = plt.subplots(figsize=(12, 6))
# errorbar=None removes the error bars. It replaces the `ci=None` spelling,
# which is deprecated since seaborn 0.12 (so this requires seaborn >= 0.12).
sns.barplot(x='Years of Experience', y='Salary', data=df, errorbar=None, ax=ax)
ax.set_xlabel('Years of Experience')
ax.set_ylabel('Average Salary')
ax.set_title('Bar Plot of Average Salary by Years of Experience')
plt.show()

In [None]:
# Job titles to keep when plotting salary by job title.
selected_job_titles = [
    'Data Analyst',
    'Software Engineer',
    'Sales Manager',
    'HR Manager',
    'Project Manager',
]


In [None]:
# Keep only the rows whose 'Job Title' is one of the selected titles.
is_selected_title = df['Job Title'].isin(selected_job_titles)
filtered_df = df.loc[is_selected_title]

In [None]:
# Bar plot for average salary by job title (selected titles only)
fig, ax = plt.subplots(figsize=(12, 6))
# errorbar=None removes the error bars. It replaces the `ci=None` spelling,
# which is deprecated since seaborn 0.12 (so this requires seaborn >= 0.12).
sns.barplot(x='Job Title', y='Salary', data=filtered_df, errorbar=None, ax=ax)
ax.set_xlabel('Job Title')
ax.set_ylabel('Average Salary')
ax.set_title('Bar Plot of Average Salary for Selected Job Titles')
# Rotate x-axis labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Column to predict, and the columns of `df` used as predictors.
target = 'Salary'
features = [
    'Age',
    'Years of Experience',
]



In [None]:
# Separate the predictor columns (X) from the target column (y).
X = df.loc[:, features]
y = df.loc[:, target]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Models: short label -> unfitted regressor to train and compare.
models = {
    'lr': LinearRegression(),
    'lss': Lasso(),
    'rg': Ridge(),
    'knr': KNeighborsRegressor(),
    # Seeded so the fitted tree is reproducible: without random_state the
    # tree may pick differently between equally good splits from run to run.
    # 42 matches the seed used for train_test_split.
    'dtr': DecisionTreeRegressor(random_state=42)
}


In [None]:
# Fit each regressor on the training split, predict on the test split, and
# print its mean squared error and R^2 score.
for name, model in models.items():
    # fit() returns the estimator itself, so the prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} MSE: {mse} Score: {r2}")
