#### step-1 import all the necessary libraries

- import numpy as np;
- import pandas as pd;
- import seaborn as sns;
- import matplotlib.pyplot as plt;
- import warnings;

- warnings.filterwarnings('ignore');

#### step-2 read the csv file

- df = pd.read_csv('insurance.csv');

#### step-3 shape, top 5 rows, information, describe, null counts, columns, graphs, correlation

- df.shape
- df.head()
- df.info()
- df.describe()
- df.isnull().sum()
- df.columns

- numeric_columns = ['age','bmi','children','charges'];
for col in numeric_columns:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col],kde = True,bins = 20)

- some count plots
sns.countplot(x = df['children'])
sns.countplot(x = df['smoker']);
for col in numeric_columns:
    plt.figure(figsize=(6,4));
    sns.boxplot(x = df[col]); # some of these plots show outliers (bmi, charges)






### step-4 data cleaning and preprocessing

- df_cleaned = df.copy();
- df_cleaned.drop_duplicates(inplace=True);

- models only work with numeric input, so the categorical columns have to be encoded as numbers (e.g. 0 and 1)

##### label encoding 
1. print(df_cleaned['sex'].value_counts());
2. df_cleaned['sex'] = df_cleaned['sex'].map({"male":0,"female":1});
3. df_cleaned['smoker'].value_counts()
4. df_cleaned['smoker'] = df_cleaned['smoker'].map({"no":0,"yes":1}); # label encoding

5. rename the columns: df_cleaned.rename(columns={'sex':'is_female','smoker':'is_smoker'},inplace=True);

##### one hot coding 
1. df['region'].value_counts()
2. df_cleaned = pd.get_dummies(df_cleaned,columns=['region'],drop_first=False);
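3. df_cleaned = df_cleaned.astype(int)  # note: this casts every column to int, so any float columns (e.g. bmi and charges, if stored as floats) lose their decimals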
 





### step-5 feature engineering and extraction 
<!-- ## Feature Engineering and extraction -->
1. sns.histplot(df['bmi']);
2. df_cleaned['bmi_category'] = pd.cut(
    df_cleaned['bmi'],
    bins=[0,18.5,24.9,29.9,float('inf')],
    labels = ['Underweight','Normal','Overweight','Obese']
);
3. df_cleaned = pd.get_dummies(df_cleaned,columns=['bmi_category'],drop_first=False);
4. if any of these columns have to be dropped: df_cleaned.drop(columns=['bmi_category_Underweight','bmi_category_Normal','bmi_category_Overweight','bmi_category_Obese'], inplace=True)
5. df_cleaned = df_cleaned.astype(int);




#### step-6 scaling 

##### Scaling is important for:

1. Distance-based models (KNN, K-Means, PCA)

2. Gradient-based models (Logistic Regression, Linear Regression, Neural Networks, SVM)

3. Regularized models (Lasso, Ridge)

- Because features with larger values can dominate the model if not scaled.

##### Scaling is NOT needed much for:

1. Tree-based models (Decision Tree, Random Forest, XGBoost, LightGBM)

Because these algorithms split data based on thresholds, not distances.



Example : If age values are [20, 30, 40]
After scaling → [-1.22, 0, 1.22] (centered and rescaled).

Code:

- from sklearn.preprocessing import StandardScaler;
- cols = ['age','bmi','children'];
- scaler = StandardScaler()
- df_cleaned[cols] = scaler.fit_transform(df_cleaned[cols])
- print(df_cleaned.head());





##### step-7 Feature Extraction  (using correlation)
1. It helps you decide which features are useful for predicting your target.


- selected_feature = [
  -  'age', 'is_female', 'bmi', 'children', 'is_smoker', 'charges',
  -  'region_northwest', 'region_southeast', 'region_southwest',
  -  'bmi_category_Underweight', 'bmi_category_Normal',
  -  'bmi_category_Overweight', 'bmi_category_Obese'
- ]
- df_selected = df_cleaned[selected_feature]
- correlation_series = df_selected.corrwith(df_cleaned['charges']);
- correlation_df = correlation_series.sort_values(ascending=False);
- print(correlation_df);


##### using Categorical 

- cat_feature = [
  -  'is_female', 'is_smoker',
  -  'region_northwest', 'region_southeast', 'region_southwest',
  -  'bmi_category_Underweight', 'bmi_category_Normal',
  -  'bmi_category_Overweight', 'bmi_category_Obese'
- ]

- df_encoded = pd.get_dummies(df_cleaned[cat_feature], drop_first=False)

- cat_corr = df_encoded.corrwith(df_cleaned['charges']).sort_values(ascending=False)

- print(cat_corr)


##### step-8 select the final features
- final_df = df_cleaned[['age', 'is_female', 'bmi', 'children', 'is_smoker', 'charges','region_southeast','bmi_category_Obese']]

##### step-9 model training 

- from sklearn.model_selection import train_test_split

- x = final_df.drop('charges',axis = 1);  # x: the input features (everything except 'charges')
- y = final_df['charges'];  # y: the target column ('charges')

- x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=42) # splits the data into two parts: 80% for training and 20% for testing








##### step-10 Linear Regression model

- from sklearn.linear_model import LinearRegression
- model = LinearRegression();
- model.fit(x_train,y_train);  

- y_pred = model.predict(x_test);  # model predictions for the test set
- y_pred

- y_test  # the original (actual) output values


##### step-11 check how well my model performs (R2 score and adjusted R2)

- from sklearn.metrics import r2_score;
- r2 = r2_score(y_test,y_pred);
- r2

- n= x_test.shape[0];
- p = x_test.shape[1];

- adjusted_r2 = 1- ((1-r2)*(n-1))/(n-p-1);
- adjusted_r2;

##### these are the steps I have to follow in machine learning