In [8]:
import os
os.getcwd()


'c:\\Users\\Rishitha\\Python1\\Day-13'

In [None]:
import pandas as pd
import numpy as np
import os

def transform_telco(input_path, output_path, display_rows=10):
    # 0Ô∏è‚É£ Check if input file exists
    if not os.path.exists(input_path):
        raise FileNotFoundError(
            f"‚ùå Input CSV not found at: {input_path}\n"
            f"‚Ñπ  Please download it from Kaggle and place it here."
        )
    
    print(f"üîç Loading dataset from: {input_path}")
    df = pd.read_csv(input_path)
    print(f"üìå Initial shape: {df.shape}")
    # 1Ô∏è‚É£ CLEANING TASKS
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].replace(" ", np.nan), errors="coerce")
    for col in ["tenure", "MonthlyCharges", "TotalCharges"]:
        df[col].fillna(df[col].median(), inplace=True)
    for col in df.select_dtypes(include=["object"]).columns:
        df[col].fillna("Unknown", inplace=True)
    # 2Ô∏è‚É£ FEATURE ENGINEERING
    df["tenure_group"] = pd.cut(
        df["tenure"],
        bins=[0, 12, 36, 60, 1000],
        labels=["New", "Regular", "Loyal", "Champion"],
        right=True
    )
    df["monthly_charge_segment"] = pd.cut(
        df["MonthlyCharges"],
        bins=[0, 30, 70, 500],
        labels=["Low", "Medium", "High"]
    )
    df["has_internet_service"] = df["InternetService"].map({
        "DSL": 1,
        "Fiber optic": 1,
        "No": 0
    }).fillna(0).astype(int)
    df["is_multi_line_user"] = df["MultipleLines"].apply(lambda x: 1 if x == "Yes" else 0)
    df["contract_type_code"] = df["Contract"].map({
        "Month-to-month": 0,
        "One year": 1,
        "Two year": 2
    }).fillna(0).astype(int)
    # 3Ô∏è‚É£ DROP UNNEEDED COLUMNS
    df.drop(columns=["customerID", "gender"], inplace=True, errors="ignore")
    # 4Ô∏è‚É£ EXPORT TRANSFORMED DATA
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"üéâ Transformation complete! Saved to: {output_path}")
    print(f"üìä Final shape: {df.shape}")

    # ============================================================
    # 5Ô∏è‚É£ DISPLAY DATAFRAME INFORMATION
    # ============================================================
    print("\nüñ• Showing the first few rows of the transformed dataset:")
    print(df.head(display_rows))
    
    print("\n‚Ñπ Dataset Info:")
    print(df.info())

    print("\nüìä Statistical Summary of Numeric Columns:")
    print(df.describe())

    print("\nüóÇ Column Names:")
    print(df.columns.tolist())

    return df


# ============================================================
# RUN SCRIPT
# ============================================================
if __name__ == "__main__":
    input_file = "data/raw/Telco-Customer-Churn.csv"      # <-- update path if needed
    output_file = "data/staged/telco_transformed.csv"
    
    try:
        transform_telco(input_file, output_file)
    except FileNotFoundError as e:
        print(e)

üîç Loading dataset from: data/raw/Telco-Customer-Churn.csv
üìå Initial shape: (7043, 21)
üéâ Transformation complete! Saved to: data/staged/telco_transformed.csv
üìä Final shape: (7043, 24)

üñ• Showing the first few rows of the transformed dataset:
   SeniorCitizen Partner Dependents  tenure PhoneService     MultipleLines  \
0              0     Yes         No       1           No  No phone service   
1              0      No         No      34          Yes                No   
2              0      No         No       2          Yes                No   
3              0      No         No      45           No  No phone service   
4              0      No         No       2          Yes                No   
5              0      No         No       8          Yes               Yes   
6              0      No        Yes      22          Yes               Yes   
7              0      No         No      10           No  No phone service   
8              0     Yes         No      2

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [2]:
from supabase import create_client
import pandas as pd, os
from dotenv import load_dotenv
load_dotenv()
supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
data = supabase.table("titanic_data").select("*").execute()
df = pd.DataFrame(data.data)
df.head()
df.info()
df.isnull().sum()
df.describe()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   sex          891 non-null    object 
 4   age          891 non-null    float64
 5   sibsp        891 non-null    int64  
 6   parch        891 non-null    int64  
 7   fare         891 non-null    float64
 8   embarked     891 non-null    object 
 9   class        891 non-null    object 
 10  who          891 non-null    object 
 11  deck         891 non-null    object 
 12  embark_town  889 non-null    object 
 13  alone        891 non-null    bool   
 14  family_size  891 non-null    int64  
 15  is_alone     891 non-null    bool   
 16  title        891 non-null    object 
dtypes: bool(2), float64(2), int64(6), object(7)
memory usage: 106.3+ KB


Index(['id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'deck', 'embark_town', 'alone',
       'family_size', 'is_alone', 'title'],
      dtype='object')