In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the startups CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [3]:
# Explore the data: preview rows, column schema, and per-column null counts
print("First few rows of the dataset:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() emits a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())

# Encode categorical variable and prepare features.
# drop_first=True drops the first level of each encoded column so the
# remaining dummies are not perfectly collinear with the model intercept.
X = pd.get_dummies(df.drop('Profit', axis=1), drop_first=True)
y = df['Profit']

# Continuous features to standardise; the dummy columns are left untouched
continuous_features = ['R&D Spend', 'Administration', 'Marketing Spend']

# Split BEFORE scaling so that no statistic of the held-out rows can leak
# into preprocessing. Same test_size / random_state as before, so the same
# rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw[continuous_features])

# Write the scaled values back into copies of the split frames. Assigning
# into the existing frames keeps each row's original index label and the
# original column order, so nothing depends on df having a 0..n-1 index.
X_train = X_train_raw.copy()
X_train[continuous_features] = scaler.transform(X_train_raw[continuous_features])

X_test = X_test_raw.copy()
X_test[continuous_features] = scaler.transform(X_test_raw[continuous_features])

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Calculate Mean Squared Error on the test set
mse = mean_squared_error(y_test, y_pred)
print("\nMean Squared Error:", mse)

First few rows of the dataset:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB
None

Missing values:
R&D Spend          0
Administration     0
