# INTERNSHIP TASK-1

## DATA PIPELINE DEVELOPMENT

### Task: Create a pipeline for data preprocessing, transformation, and loading using tools like pandas and scikit-learn.

### Dataset link: https://data.world/kudem/heart-disease-dataset/workspace/file?filename=heart_data.csv

### Import required libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

#### The heart disease dataset (heart_data.csv) is used for this task

### Load the dataset

In [None]:
# Read the heart-disease CSV into a DataFrame; the path below is hard-coded.
# NOTE(review): "/content/..." looks like a Google Colab path - adjust it when
# running elsewhere.
df = pd.read_csv("/content/heart_data.csv")
# Bare expression so the notebook renders a preview of the loaded frame.
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


#### This dataset consists of 70000 rows and 13 columns

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(70000, 13)

In [None]:
# Preview the first 5 rows (the default for DataFrame.head()).
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### Basic data understanding

In [None]:
# Summarize column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after the report.
print("\nDataset info:")
df.info()


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None


In [None]:
# Count the missing (NaN/None) entries in every column, then report them.
missing_per_column = df.isna().sum()

print("\nmissing values in each column:")
print(missing_per_column)


missing values in each column:
id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


#### There are no missing values in the dataset

### Identify numeric and categorical columns

In [None]:
# Columns routed through the numeric branch of the preprocessor.
Num_cols = [
    'age',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
]

# Columns routed through the categorical branch of the preprocessor.
Cat_cols = [
    'gender',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
]


### Define Preprocessing Steps

In [None]:
# Numeric branch: fill missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its branch. Columns listed in neither group
# (e.g. 'id', 'cardio') are not passed to any transformer.
column_branches = [
    ('num', numeric_transformer, Num_cols),
    ('cat', categorical_transformer, Cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

### Apply preprocessing + transformation

In [None]:
# Fit the preprocessor on the whole frame and transform it in a single pass.
X_transformed = preprocessor.fit_transform(df)

# Report the size of the transformed matrix (rows, generated feature columns).
transformed_shape = X_transformed.shape
print("\nData Preprocessing & Transformation Completed!")
print("Transformed Shape:", transformed_shape)


Data Preprocessing & Transformation Completed!
Transformed Shape: (70000, 19)


### Feature and Target split

In [None]:
# Name of the label column to predict.
target_col = 'cardio'

# Features: every column except the target.
X = df.drop(columns=target_col)

# Target: the label column on its own, as a Series.
y = df[target_col]

In [None]:
# Bare expression so the notebook renders the feature frame.
X

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110,80,1,1,0,0,1
1,20228,1,156,85.0,140,90,3,1,0,0,1
2,18857,1,165,64.0,130,70,3,1,0,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1
69996,22601,1,158,126.0,140,90,2,2,0,0,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0
69998,22431,1,163,72.0,135,80,1,2,0,0,0


In [None]:
# Bare expression so the notebook renders the target column.
y

Unnamed: 0,cardio
0,0
1,1
2,1
3,1
4,0
...,...
69995,0
69996,1
69997,1
69998,1


### Split data for training and testing

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed so the split is
# repeatable, and stratifying on y keeps the class proportions the same in
# both parts.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

#### Here we split dataset for training & testing
#### 80% for training
#### 20% for testing
#### stratify=y keeps the class proportions of the target the same in the training and testing sets (useful for classification datasets).

### Build full Pipeline

In [None]:
# Gradient boosting classifier with a fixed seed so repeated runs are
# reproducible.
model = GradientBoostingClassifier(random_state=42)

# Chain preprocessing and the classifier so the same transformations are
# applied at fit time and at predict time.
# The seeded `model` instance is used as the final step; constructing a fresh
# GradientBoostingClassifier() here would leave `model` unused and drop the
# random_state setting.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', model),
])
print("Model pipeline Created!")

Model pipeline Created!


#### The code above defines a Gradient Boosting Classifier as the prediction model. The Pipeline chains the preprocessing and model steps together, ensuring consistent processing for both training and prediction.

### Train the Pipeline

In [None]:
# Fit every pipeline step (preprocessing, then the classifier) on the
# training split only.
pipeline.fit(X=X_train, y=y_train)

print("\nModel training completed")


Model training completed


#### Fitting the pipeline first fits the preprocessor (imputation, scaling, one-hot encoding) on the training data, then trains the model on the transformed data.

### Make predictions

In [None]:
# Run the held-out rows through the fitted pipeline (preprocessing followed
# by the classifier) to obtain predicted labels.
y_pred = pipeline.predict(X=X_test)

print("Predictions Completed!")

Predictions Completed!


### Evaluate the model

In [None]:
# Build the per-class precision/recall/F1 table for the test split, then
# print it under a heading.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.77      0.74      7004
           1       0.75      0.70      0.72      6996

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000



#### pipeline.predict() runs both preprocessing and prediction on test data.
#### classification_report gives:
  #### Precision (accuracy of positive predictions)
  #### Recall (coverage of positive samples)
  #### F1-score (balance of precision & recall)
  #### Accuracy

### Save transformed data to csv

In [None]:

# Transform the full dataset for export.
# A clone of the preprocessor is fitted here rather than the original object:
# `preprocessor` is also the first step of `pipeline`, so refitting it in
# place would overwrite the fitted state the trained pipeline relies on.
from sklearn.base import clone

export_preprocessor = clone(preprocessor)
full_transformed = export_preprocessor.fit_transform(df)

# The ColumnTransformer may hand back a sparse matrix (OneHotEncoder expands
# columns); densify it before building the DataFrame.
if hasattr(full_transformed, "toarray"):
    dense_values = full_transformed.toarray()
else:
    dense_values = full_transformed

# Label the columns with the generated feature names so the CSV is
# self-describing instead of carrying anonymous 0..n-1 headers.
full_df = pd.DataFrame(
    dense_values,
    columns=export_preprocessor.get_feature_names_out(),
)

full_df.to_csv("transformed_heart_data.csv", index=False)

print("\nTransformed dataset saved as 'transformed_heart_data.csv'")



Transformed dataset saved as 'transformed_heart_data.csv'
