In [1]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# The raw training features are split across five numbered chunk files.
chunk_paths = [
    f"data/raw/orange_large_train.data.chunk{chunk_no}"
    for chunk_no in range(1, 6)
]

In [3]:
# Load every chunk, shrink its numeric dtypes, and stack the chunks into X.
dfs = []
columns = None

# Range limits used to decide whether an integer column can be stored as int32.
int32_bounds = np.iinfo(np.int32)

for i, path in enumerate(chunk_paths):
    print(f"Loading {path} ...")

    if i == 0:
        # First chunk: has header row
        df_chunk = pd.read_csv(
            path,
            sep="\t",
            header=0,          # use first line as header
            low_memory=False,
        )
        columns = df_chunk.columns
    else:
        # Subsequent chunks carry no header: every row is data
        df_chunk = pd.read_csv(
            path,
            sep="\t",
            header=None,       # treat all rows as data
            names=columns,     # reuse header from chunk1
            low_memory=False,
        )

    # Memory optimization
    # Convert all float columns to float32
    float_cols = df_chunk.select_dtypes(include=['float']).columns
    df_chunk[float_cols] = df_chunk[float_cols].astype(np.float32)

    # Convert integer columns to int32, but only those whose values fit.
    # A blind astype(np.int32) silently wraps out-of-range values around,
    # corrupting the data; columns that do not fit keep their original dtype.
    int_cols = df_chunk.select_dtypes(include=['integer']).columns
    if len(int_cols) > 0:
        fits_int32 = (
            (df_chunk[int_cols].min() >= int32_bounds.min)
            & (df_chunk[int_cols].max() <= int32_bounds.max)
        ).to_numpy()
        int_cols = int_cols[fits_int32]
    if len(int_cols) > 0:
        df_chunk[int_cols] = df_chunk[int_cols].astype(np.int32)

    print(f"  shape: {df_chunk.shape}")
    dfs.append(df_chunk)

    # Force garbage collection
    gc.collect()

# Concatenate all chunks into a single DataFrame
# NOTE(review): dtypes are inferred per chunk, and the dtype summary later in
# the notebook still reports some float64 columns. Presumably a column read as
# int in one chunk and float in another is promoted on concat -- confirm.
print("Concatenating chunks...")
X = pd.concat(dfs, axis=0, ignore_index=True)

# Clear list to free memory
del dfs
gc.collect()

print("\nFull feature matrix shape:", X.shape)
X.head()

Loading data/raw/orange_large_train.data.chunk1 ...
  shape: (9999, 15000)
Loading data/raw/orange_large_train.data.chunk2 ...
  shape: (10000, 15000)
Loading data/raw/orange_large_train.data.chunk3 ...
  shape: (10000, 15000)
Loading data/raw/orange_large_train.data.chunk4 ...
  shape: (10000, 15000)
Loading data/raw/orange_large_train.data.chunk5 ...
  shape: (10001, 15000)
Concatenating chunks...

Full feature matrix shape: (50000, 15000)


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var14991,Var14992,Var14993,Var14994,Var14995,Var14996,Var14997,Var14998,Var14999,Var15000
0,0,0,0,0,0,0,0.0,0,0,0,...,,,IPyde4c,,KttQ,NBRvrWWx0Z,,,,
1,0,0,0,0,0,0,0.0,0,0,0,...,,,tQAVcMq,,KttQ,NBRvrWWx0Z,,,,
2,0,0,0,0,6,0,0.0,0,0,0,...,,,mnKkkvG,,KttQ,NBRvrWWx0Z,,,,
3,0,0,0,0,0,0,0.0,0,0,0,...,,,ZF0bafC,,KttQ,NBRvrWWx0Z,,,,
4,0,0,0,0,0,0,0.0,0,0,0,...,,,z1miBkG,,Q8_a,,,,,


In [4]:
# Read the churn labels: a single unnamed column, no header row.
y_full = pd.read_csv(
    "data/raw/orange_large_train_churn.labels",
    names=["churn"],
    header=None,
)

# Trim to the number of feature rows. The copy keeps the recode below from
# touching y_full.
n_feature_rows = len(X)
y = y_full.iloc[:n_feature_rows].copy()

# Recode the target: 1 stays 1, anything else (the raw file uses -1) becomes 0.
y["churn"] = y["churn"].eq(1).astype(int)

print("X shape:", X.shape)
print("y shape:", y.shape)
print(y["churn"].value_counts())

X shape: (50000, 15000)
y shape: (50000, 1)
churn
0    46328
1     3672
Name: count, dtype: int64


In [5]:
# Dimensions of the feature matrix and of the label frame
for frame_name, frame in (("X", X), ("y", y)):
    print(f"{frame_name} shape:", frame.shape)

# Class imbalance: absolute counts first, then proportions
churn_labels = y["churn"]
print(churn_labels.value_counts())
print(churn_labels.value_counts(normalize=True))

# Number of columns per dtype (rendered as the cell result)
X.dtypes.value_counts()

X shape: (50000, 15000)
y shape: (50000, 1)
churn
0    46328
1     3672
Name: count, dtype: int64
churn
0    0.92656
1    0.07344
Name: proportion, dtype: float64


int32      13414
float32     1339
object       189
float64       58
Name: count, dtype: int64

In [6]:
# Per-column share of NaN entries (0.0 = complete, 1.0 = entirely missing)
missing_frac = X.isnull().mean(axis=0)

print(missing_frac.describe())

# Five columns with the largest missing share (rendered as the cell result)
missing_sorted = missing_frac.sort_values(ascending=False)
missing_sorted.iloc[:5]

count    15000.000000
mean         0.033478
std          0.173158
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
dtype: float64


Var4790     1.0
Var14775    1.0
Var14979    1.0
Var14764    1.0
Var14137    1.0
dtype: float64

In [7]:
# Entirely empty columns: missing fraction is exactly 1.0
is_all_nan = missing_frac.eq(1.0)
cols_all_nan = missing_frac[is_all_nan].index
print("All-NaN cols:", len(cols_all_nan))

# Columns where at least 95% of the rows are missing
is_high_missing = missing_frac.ge(0.95)
cols_high_missing = missing_frac[is_high_missing].index
print("Cols with >=95% missing:", len(cols_high_missing))

# Peek at the first ten names
cols_high_missing[:10]

All-NaN cols: 109
Cols with >=95% missing: 393


Index(['Var52', 'Var156', 'Var217', 'Var269', 'Var360', 'Var598', 'Var609',
       'Var673', 'Var723', 'Var808'],
      dtype='object')

In [8]:
# The >=95% set is a superset of the all-NaN columns, so one drop covers both.
cols_missing = cols_high_missing
print("Dropping", len(cols_missing), "columns due to high missingness.")

# Rebinds X to a frame without those columns.
X = X.drop(cols_missing, axis=1)
print("New X shape after missing-drop:", X.shape)

Dropping 393 columns due to high missingness.
New X shape after missing-drop: (50000, 14607)


In [9]:
# Distinct non-NaN values in each column
nunique = X.nunique(dropna=True)
print(nunique.describe())

# A column with at most one distinct non-NaN value is treated as constant.
is_constant = nunique.le(1)
cols_constant = nunique[is_constant].index
print("Constant (or almost constant) cols:", len(cols_constant))

# Rebinds X to a frame without those columns.
X = X.drop(cols_constant, axis=1)
print("New X shape after dropping constant cols:", X.shape)

count    14607.000000
mean      1356.555829
std       6771.498869
min          1.000000
25%          2.000000
50%          3.000000
75%         21.000000
max      49855.000000
dtype: float64
Constant (or almost constant) cols: 1439
New X shape after dropping constant cols: (50000, 13168)


In [10]:
# Object-dtype columns are treated as categorical; every other column is numeric.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

print("Numeric cols:", len(num_cols))
print("Categorical cols:", len(cat_cols))

Numeric cols: 13051
Categorical cols: 117


In [11]:
# How "wide" are the categorical columns? Show the ten with the most levels.
#
# Only the last expression of a cell is rendered. The numeric describe() summary
# and the describe() of the cardinalities were marked "commented out for speed"
# but still executed, with their results thrown away. They are omitted here so
# the cell no longer pays for output it never shows.
X[cat_cols].nunique().sort_values(ascending=False).head(10)

Var14784    15415
Var14868    15415
Var14822    13990
Var14913     5713
Var14904     5073
Var14893     4291
Var14797     4291
Var14993     4291
Var14795     2016
Var14788      361
dtype: int64

In [12]:
print("Imputation & Encoding")

# Categorical gaps become an explicit "MISSING" level.
X[cat_cols] = X[cat_cols].fillna(value="MISSING")
print("Categorical missing values filled.")

# Numeric gaps are filled with the sentinel -1.
# NOTE(review): the sanity check later in the notebook reports negative numeric
# values, so -1 is not guaranteed to be distinguishable from real data.
X[num_cols] = X[num_cols].fillna(value=-1)
print("Missing values filled.")

# Map each categorical level to an int32 code, one code space per column.
# NOTE(review): the encoder is fitted on all rows, before any train/test split.
encoder = OrdinalEncoder(
    dtype=np.int32,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
X[cat_cols] = encoder.fit_transform(X[cat_cols])
print("Categorical columns encoded to integers.")

# Verify: shape is unchanged and no object columns should remain
print("\nNew Shape:", X.shape)
print("Dtypes breakdown:")
print(X.dtypes.value_counts())

Imputation & Encoding
Categorical missing values filled.
Missing values filled.
Categorical columns encoded to integers.

New Shape: (50000, 13168)
Dtypes breakdown:
int32      12111
float32      999
float64       58
Name: count, dtype: int64


In [13]:
# Sanity check for the -1 missing-value sentinel in numeric columns.
#
# This runs after numeric NaNs were already filled with -1, so a plain
# "minimum < 0" test fires whenever anything was imputed and says nothing about
# the real data. Instead, look for negative values other than the sentinel.
# A genuine -1 in the raw data cannot be told apart from an imputed one here.
min_val = X[num_cols].min().min()
print(f"Minimum value in numeric columns: {min_val}")

# Column-by-column scan avoids building frame-sized boolean masks.
has_real_negatives = any(
    ((X[col_name] < 0) & (X[col_name] != -1)).any()
    for col_name in num_cols
)

if has_real_negatives:
    print("Some numeric columns have negative values")
else:
    print("Safe to use -1 for missing values")

Minimum value in numeric columns: -46407868416.0
Some numeric columns have negative values


In [14]:
# Rows that repeat an earlier row on every feature
dup_rows = X.duplicated().sum()
print("Duplicate feature rows:", dup_rows)

# Same check with the churn label attached as an extra column
features_with_label = pd.concat([X, y.reset_index(drop=True)], axis=1)
dup_full = features_with_label.duplicated().sum()
del features_with_label  # temporary frame is as large as X; release it right away
print("Duplicate rows including churn label:", dup_full)

Duplicate feature rows: 0
Duplicate rows including churn label: 0


In [15]:
print("Final Checks")

# Every column must be numeric by this point.
non_numeric = X.select_dtypes(exclude=[np.number]).columns
if len(non_numeric) == 0:
    print("All columns are numeric.")
else:
    print(f"Found {len(non_numeric)} non-numeric columns: {list(non_numeric)}")

# Count leftover NaN and +/-inf entries across the whole frame.
n_nan = X.isna().sum().sum()
n_inf = np.isinf(X).values.sum()
print(
    f"Found {n_nan} remaining missing values!"
    if n_nan > 0
    else "No missing values remaining."
)
print(
    f"Found {n_inf} infinite values!"
    if n_inf > 0
    else "No infinite values."
)

# Features and labels must have the same number of rows.
if len(X) == len(y):
    print(f"Dimensions match: {len(X)} rows.")
else:
    print(f"X and y have different lengths! X:{len(X)}, y:{len(y)}")

# Reminder of the class balance
print("\nTarget Distribution:")
print(y['churn'].value_counts(normalize=True))

Final Checks
All columns are numeric.
No missing values remaining.
No infinite values.
Dimensions match: 50000 rows.

Target Distribution:
churn
0    0.92656
1    0.07344
Name: proportion, dtype: float64


In [16]:
# Preview the first rows of the feature matrix after imputation and encoding.
X.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var14974,Var14975,Var14980,Var14983,Var14989,Var14990,Var14993,Var14995,Var14996,Var14998
0,0,0,0,0,0,0,0.0,0,0,0,...,3,2,1,3,2,0,1263,0,1,34
1,0,0,0,0,0,0,0.0,0,0,0,...,13,3,3,3,2,12,3880,0,1,34
2,0,0,0,0,6,0,0.0,0,0,0,...,3,3,1,3,2,5,3379,0,1,34
3,0,0,0,0,0,0,0.0,0,0,0,...,3,3,1,0,2,7,2332,0,1,34
4,0,0,0,0,0,0,0.0,0,0,0,...,3,3,1,0,2,0,4228,2,0,34


In [19]:
# Attach the labels, make a stratified 80/20 split, and persist both parts.
df_clean = X.copy()

# Assign labels
if len(y) != len(df_clean):
    print(f"Label count ({len(y)}) does not match Data count ({len(df_clean)})!")
    # Fallback (should not happen if files are correct): label the first len(y)
    # rows and leave the remainder as NaN.
    df_clean["churn"] = np.nan
    df_clean.iloc[:len(y), df_clean.columns.get_loc("churn")] = y["churn"].values
else:
    df_clean["churn"] = y["churn"].values
    print("Labels assigned to all rows")

# Split into Local Train and Local Test using Random Stratified Split
print("Performing Random Stratified Split (80% Train, 20% Test)...")

df_train, df_test = train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42,               # fixed seed keeps the split reproducible
    stratify=df_clean['churn'],    # preserve the churn ratio in both parts
)

print(f"Train shape: {df_train.shape}")
print(f"Test shape:  {df_test.shape}")

# Create the output directory first: otherwise a fresh checkout fails on the
# very last step, after all the preprocessing has already run.
Path("data/clean").mkdir(parents=True, exist_ok=True)

df_train.to_parquet("data/clean/orange_large_train_clean.parquet", index=False)
df_test.to_parquet("data/clean/orange_large_test_clean.parquet", index=False)

print("Saved 'data/clean/orange_large_train_clean.parquet'")
print("Saved 'data/clean/orange_large_test_clean.parquet'")

Labels assigned to all rows
Performing Random Stratified Split (80% Train, 20% Test)...
Train shape: (40000, 13169)
Test shape:  (10000, 13169)
Saved 'data/clean/orange_large_train_clean.parquet'
Saved 'data/clean/orange_large_test_clean.parquet'


In [20]:
# Preview the combined frame: the feature columns plus the churn column.
df_clean.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var14975,Var14980,Var14983,Var14989,Var14990,Var14993,Var14995,Var14996,Var14998,churn
0,0,0,0,0,0,0,0.0,0,0,0,...,2,1,3,2,0,1263,0,1,34,0
1,0,0,0,0,0,0,0.0,0,0,0,...,3,3,3,2,12,3880,0,1,34,0
2,0,0,0,0,6,0,0.0,0,0,0,...,3,1,3,2,5,3379,0,1,34,0
3,0,0,0,0,0,0,0.0,0,0,0,...,3,1,0,2,7,2332,0,1,34,0
4,0,0,0,0,0,0,0.0,0,0,0,...,3,1,0,2,0,4228,2,0,34,1
