In [1]:
# Import necessary libraries
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: Dataset Exploration
#
# Load the bundled Iris data and inspect it as a DataFrame: a preview of the
# leading rows, the overall dimensions, and per-column descriptive statistics.

# `iris` is the object returned by scikit-learn's loader; its `.data`,
# `.target` and `.feature_names` attributes are what the notebook reads.
iris = load_iris()

# Tabular view of the measurements, one column per feature name.
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the leading five rows (header line, then the frame on its own line).
print("First five rows of the Iris dataset:", iris_df.head(5), sep="\n")

# Dimensions as a (rows, columns) tuple.
print("\nDataset shape (rows, columns):", iris_df.shape)

# Descriptive statistics (count, mean, std, min, quartiles, ...) per column.
print("\nSummary statistics for each feature:", iris_df.describe(), sep="\n")

# Step 2: Data Splitting
#
# Hold out 20% of the Iris samples for testing. The split is stratified on the
# class labels so the training and testing sets keep the same class
# proportions as the full dataset; a plain shuffled split can leave the small
# test set with an uneven class mix. random_state pins the shuffle so the
# split is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Sanity check: the two sets together account for every sample.
assert X_train.shape[0] + X_test.shape[0] == iris.data.shape[0]

# Report how many samples ended up on each side of the split.
print("\nNumber of samples in the training set:", X_train.shape[0])
print("Number of samples in the testing set:", X_test.shape[0])

# Step 3: Linear Regression
#
# Fit a one-feature linear model on a small simulated
# YearsExperience -> Salary dataset, then report its slope, intercept, and
# error on the held-out rows.

# Feature column: 1 through 10 years of experience, reshaped to a single
# column because the estimator takes a 2-D feature matrix.
years_experience = np.arange(1, 11).reshape(-1, 1)

# Target: the simulated salary (in thousands) paired with each experience value.
salary = np.array([30, 35, 40, 45, 50, 60, 70, 80, 90, 100])

# 80-20 split with a fixed seed so the same rows are held out on every run.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    years_experience,
    salary,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and training chain.
model = LinearRegression().fit(X_train_lr, y_train_lr)

# Score the held-out rows: predictions first, then their mean squared error.
y_pred_lr = model.predict(X_test_lr)
mse = mean_squared_error(y_true=y_test_lr, y_pred=y_pred_lr)

# Summary of the fitted line and its test error.
print("\nLinear Regression Model:")
print("Coefficient (Slope):", model.coef_[0])
print("Intercept:", model.intercept_)
print("Mean Squared Error on the Test Set:", mse)


First five rows of the Iris dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Dataset shape (rows, columns): (150, 4)

Summary statistics for each feature:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.00