In [1]:
import os
import pandas as pd
import zipfile
from tqdm import tqdm  # Progress bar

# ✅ Define Paths
zip_file_path = r"C:\Users\palak\Desktop\sem6\Intel_unaati\EdNet-KT4.zip"
output_file = r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv"

# ✅ Initialize an empty DataFrame
combined_df = pd.DataFrame()

# ✅ Open ZIP file without extracting
with zipfile.ZipFile(zip_file_path, 'r') as z:
    file_list = [f for f in z.namelist() if f.endswith('.csv')]  # Filter only CSV files

    print(f"✅ Found {len(file_list)} CSV files. Processing 15% of them...")

    # Sample 25% of the files
    sample_files = pd.Series(file_list).sample(frac=0.15, random_state=42).tolist()

    for file in tqdm(sample_files, desc="Processing Files"):
        try:
            # Read CSV directly from ZIP without extracting
            with z.open(file) as f:
                df = pd.read_csv(f)

                # Convert timestamp column if it exists
                if 'timestamp' in df.columns:
                    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

                # Append to the combined dataframe
                combined_df = pd.concat([combined_df, df], ignore_index=True)

        except Exception as e:
            print(f"⚠️ Skipping file {file} due to error: {e}")

# ✅ Save the combined dataset
combined_df.to_csv(output_file, index=False)

print(f"✅ Data Processing Complete! Saved to: {output_file}")


✅ Found 297915 CSV files. Processing 15% of them...


Processing Files: 100%|██████████| 44687/44687 [6:14:13<00:00,  1.99it/s]      


✅ Data Processing Complete! Saved to: C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt


In [3]:
# ✅ Load Processed Data
file_path = r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv"
df = pd.read_csv(file_path)

# ✅ Display Data Info
print("✅ Data Loaded Successfully!")
print("\n📊 Data Info:\n", df.info())
print("\n🔍 Sample Data:\n", df.head())

print("\n❌ Missing Values:\n", df.isnull().sum())


✅ Data Loaded Successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19077465 entries, 0 to 19077464
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   timestamp    object 
 1   action_type  object 
 2   item_id      object 
 3   cursor_time  float64
 4   source       object 
 5   user_answer  object 
 6   platform     object 
dtypes: float64(1), object(6)
memory usage: 1018.8+ MB

📊 Data Info:
 None

🔍 Sample Data:
                  timestamp action_type item_id  cursor_time     source  \
0  2018-09-14 01:36:22.395       enter   b5060          NaN  diagnosis   
1  2018-09-14 01:36:55.909     respond   q6528          NaN  diagnosis   
2  2018-09-14 01:36:56.645      submit   b5060          NaN  diagnosis   
3  2018-09-14 01:36:58.168       enter   b3779          NaN  diagnosis   
4  2018-09-14 01:37:21.381     respond   q5247          NaN  diagnosis   

  user_answer platform  
0         NaN      web  
1           d      web  
2         Na

In [4]:
# Fill missing values for numerical columns
df.fillna(method='ffill', inplace=True)  # Forward fill


In [5]:
# Convert timestamp to datetime and extract features
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour  # Extract hour
df['day'] = df['timestamp'].dt.day  # Extract day
df['month'] = df['timestamp'].dt.month  # Extract month


In [6]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=['action_type', 'source', 'platform'], drop_first=True)


Train LSTM for Student Engagement Analysis

In [12]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv"
df = pd.read_csv(file_path)

# Display first few rows to understand its structure
print(df.head())

# Display column names to ensure all features are present
print(df.columns)


                 timestamp action_type item_id  cursor_time     source  \
0  2018-09-14 01:36:22.395       enter   b5060          NaN  diagnosis   
1  2018-09-14 01:36:55.909     respond   q6528          NaN  diagnosis   
2  2018-09-14 01:36:56.645      submit   b5060          NaN  diagnosis   
3  2018-09-14 01:36:58.168       enter   b3779          NaN  diagnosis   
4  2018-09-14 01:37:21.381     respond   q5247          NaN  diagnosis   

  user_answer platform  
0         NaN      web  
1           d      web  
2         NaN      web  
3         NaN      web  
4           b      web  
Index(['timestamp', 'action_type', 'item_id', 'cursor_time', 'source',
       'user_answer', 'platform'],
      dtype='object')


In [17]:
# Check the actual columns in the DataFrame
print(df.columns)


Index(['action_type', 'item_id', 'cursor_time', 'source', 'user_answer',
       'platform', 'hour', 'day', 'month'],
      dtype='object')


In [18]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv"
df = pd.read_csv(file_path)

# Convert 'timestamp' column to datetime and extract features
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['month'] = df['timestamp'].dt.month

# Drop 'timestamp' column as it's no longer needed
df.drop(columns=['timestamp'], inplace=True)

# One-hot encode categorical features (action_type, source, platform)
df = pd.get_dummies(df, columns=['action_type', 'source', 'platform'], drop_first=True)

# Fill missing values by forward filling
df.fillna(method='ffill', inplace=True)

# Check the data after preprocessing
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19077465 entries, 0 to 19077464
Data columns (total 26 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   item_id                        object 
 1   cursor_time                    float64
 2   user_answer                    object 
 3   hour                           int32  
 4   day                            int32  
 5   month                          int32  
 6   action_type_enter              bool   
 7   action_type_erase_choice       bool   
 8   action_type_pause_audio        bool   
 9   action_type_pause_video        bool   
 10  action_type_pay                bool   
 11  action_type_play_audio         bool   
 12  action_type_play_video         bool   
 13  action_type_quit               bool   
 14  action_type_refund             bool   
 15  action_type_respond            bool   
 16  action_type_submit             bool   
 17  action_type_undo_erase_choice  bool   
 18  

In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Define the chunk size (adjust as needed)
chunk_size = 10000  # You can reduce this if memory is still an issue

# Initialize a list to store processed chunks
chunks = []

# Initialize LabelEncoder for user_answer
label_encoder = LabelEncoder()

# Create a MinMaxScaler instance
scaler = MinMaxScaler()

# Iterate over the CSV file in chunks
for chunk in pd.read_csv(r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv", 
                         chunksize=chunk_size):
    
    # Process each chunk: Drop 'item_id' column
    chunk.drop(columns=['item_id'], inplace=True)
    
    # Convert 'user_answer' to numeric labels
    chunk['user_answer'] = label_encoder.fit_transform(chunk['user_answer'])
    
    # Extract time-related features from 'timestamp'
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'])
    chunk['hour'] = chunk['timestamp'].dt.hour
    chunk['day'] = chunk['timestamp'].dt.day
    chunk['month'] = chunk['timestamp'].dt.month
    
    # Drop 'timestamp' as it's no longer needed
    chunk.drop(columns=['timestamp'], inplace=True)
    
    # One-hot encode categorical features
    chunk = pd.get_dummies(chunk, columns=['action_type', 'source', 'platform'], drop_first=True)
    
    # Fill missing values by forward filling
    chunk.fillna(method='ffill', inplace=True)
    
    # Scale the features
    X = chunk.drop(columns=['user_answer'])
    y = chunk['user_answer']
    X_scaled = scaler.fit_transform(X)  # Scale the chunk

    # Add the scaled chunk to the list
    chunks.append((X_scaled, y))

# Combine all the chunks
X_combined = pd.concat([pd.DataFrame(chunk[0]) for chunk in chunks], axis=0)
y_combined = pd.concat([pd.Series(chunk[1]) for chunk in chunks], axis=0)

# Reshape for LSTM (samples, time_steps, features)
X_lstm = X_combined.values.reshape((X_combined.shape[0], 1, X_combined.shape[1]))  # Adding a time step dimension

# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_combined, test_size=0.2, random_state=42)

# Check the shape of the data
print(X_train.shape, X_test.shape)


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


(15261972, 1, 24) (3815493, 1, 24)


In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Define chunk size for memory efficiency
chunk_size = 100000

# Initialize a list to store processed chunks
chunks = []

# Initialize LabelEncoder and MinMaxScaler
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

# Iterate over the CSV file in chunks
for chunk in pd.read_csv(r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv", 
                         chunksize=chunk_size):
    
    # Drop 'item_id' column as it's not needed for training
    chunk.drop(columns=['item_id'], inplace=True)
    
    # Convert 'user_answer' to numeric labels
    chunk['user_answer'] = label_encoder.fit_transform(chunk['user_answer'])
    
    # Extract time-related features from 'timestamp'
    chunk['timestamp'] = pd.to_datetime(chunk['timestamp'])
    chunk['hour'] = chunk['timestamp'].dt.hour
    chunk['day'] = chunk['timestamp'].dt.day
    chunk['month'] = chunk['timestamp'].dt.month
    
    # Drop 'timestamp' as it's no longer needed
    chunk.drop(columns=['timestamp'], inplace=True)
    
    # One-hot encode categorical features
    chunk = pd.get_dummies(chunk, columns=['action_type', 'source', 'platform'], drop_first=True)
    
    # Fill missing values by forward filling
    chunk.fillna(method='ffill', inplace=True)
    
    # Remove columns that are completely NaN (if any)
    chunk = chunk.dropna(axis=1, how='all')
    
    # Scale the features
    X = chunk.drop(columns=['user_answer'])
    y = chunk['user_answer']
    X_scaled = scaler.fit_transform(X)  # Scale the chunk

    # Add the scaled chunk to the list
    chunks.append((X_scaled, y))

# Combine all the chunks
X_combined = pd.concat([pd.DataFrame(chunk[0]) for chunk in chunks], axis=0)
y_combined = pd.concat([pd.Series(chunk[1]) for chunk in chunks], axis=0)

# Reshape for LSTM (samples, time_steps, features)
X_lstm = X_combined.values.reshape((X_combined.shape[0], 1, X_combined.shape[1]))  # Adding a time step dimension

# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_combined, test_size=0.2, random_state=42)

# Check the shape of the data
print(X_train.shape, X_test.shape)


(15261972, 1, 24) (3815493, 1, 24)


In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Build LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),  # Shape (samples, time_steps, features)
    Dropout(0.2),  # Dropout to prevent overfitting
    LSTM(32),
    Dropout(0.2),
    Dense(3, activation='softmax')  # 3 classes: 'user_answer_b', 'user_answer_c', 'user_answer_d'
])

# Print model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 1, 64)             22784     
                                                                 
 dropout_2 (Dropout)         (None, 1, 64)             0         
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_3 (Dropout)         (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 99        
                                                                 
Total params: 35299 (137.89 KB)
Trainable params: 35299 (137.89 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [27]:
# Check the unique values in 'user_answer'
print("Unique values in 'user_answer':", y_combined.unique())


Unique values in 'user_answer': [4 3 1 0 2]


In [29]:
from sklearn.preprocessing import LabelEncoder

# Re-encode 'user_answer' to correctly reflect 5 classes
label_encoder = LabelEncoder()
y_combined = label_encoder.fit_transform(y_combined)

import numpy as np

# Check the unique values after re-encoding
print("Unique values after re-encoding:", np.unique(y_combined))


Unique values after re-encoding: [0 1 2 3 4]


In [32]:
# Rebuild the LSTM model with correct number of output units
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),  # Shape (samples, time_steps, features)
    Dropout(0.2),  # Dropout to prevent overfitting
    LSTM(32),
    Dropout(0.2),
    Dense(5, activation='softmax')  # Update to 5 classes
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary to confirm changes
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 1, 64)             22784     
                                                                 
 dropout_4 (Dropout)         (None, 1, 64)             0         
                                                                 
 lstm_5 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_5 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 5)                 165       
                                                                 
Total params: 35365 (138.14 KB)
Trainable params: 35365 (138.14 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
# Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [35]:
# Check if there are any NaN or infinite values in the dataset
import numpy as np

print("Checking for NaN or Inf in X_train:")
print(np.isnan(X_train).sum(), np.isinf(X_train).sum())

print("Checking for NaN or Inf in y_train:")
print(np.isnan(y_train).sum(), np.isinf(y_train).sum())


Checking for NaN or Inf in X_train:
16384934 0
Checking for NaN or Inf in y_train:
0 0


In [41]:
import numpy as np

# Define chunk size (adjust based on available memory)
chunk_size = 100000  # Adjust the chunk size

# Initialize empty lists to store processed chunks
X_train_chunks = []
X_test_chunks = []

# Process X_train in chunks
for start in range(0, X_train.shape[0], chunk_size):
    end = min(start + chunk_size, X_train.shape[0])
    
    chunk = X_train[start:end]
    
    # Fill NaN values by replacing with the mean of the feature (axis=0)
    for i in range(chunk.shape[2]):  # Iterate over features (columns)
        col_mean = np.nanmean(chunk[:, :, i], axis=0)  # Compute mean of each feature
        chunk[:, :, i] = np.nan_to_num(chunk[:, :, i], nan=col_mean)

    # Append the processed chunk
    X_train_chunks.append(chunk)

# Process X_test in chunks (same approach)
for start in range(0, X_test.shape[0], chunk_size):
    end = min(start + chunk_size, X_test.shape[0])
    
    chunk = X_test[start:end]
    
    # Fill NaN values by replacing with the mean of the feature (axis=0)
    for i in range(chunk.shape[2]):  # Iterate over features (columns)
        col_mean = np.nanmean(chunk[:, :, i], axis=0)  # Compute mean of each feature
        chunk[:, :, i] = np.nan_to_num(chunk[:, :, i], nan=col_mean)

    # Append the processed chunk
    X_test_chunks.append(chunk)

# Concatenate all the chunks back together
X_train = np.vstack(X_train_chunks)
X_test = np.vstack(X_test_chunks)

# Check if there are any NaN or Inf values left
print("Checking for NaN in X_train after filling:")
print(np.isnan(X_train).sum(), np.isinf(X_train).sum())

print("Checking for NaN in X_test after filling:")
print(np.isnan(X_test).sum(), np.isinf(X_test).sum())


Checking for NaN in X_train after filling:
0 0
Checking for NaN in X_test after filling:
0 0


In [46]:
# Check the current shape of X_train and X_test
print("X_train shape before reshaping:", X_train.shape)  # Expected: (15261972, 24)
print("X_test shape before reshaping:", X_test.shape)    # Expected: (3815493, 24)


X_train shape before reshaping: (15261972, 1, 24)
X_test shape before reshaping: (3815493, 1, 24)


In [47]:
# Train the model directly without reshaping
history = model.fit(
    X_train, y_train,
    epochs=10,  # Adjust number of epochs based on performance
    batch_size=32,  # Adjust batch size if needed
    validation_data=(X_test, y_test),
    verbose=1  # Set to 1 to print training progress
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Convert 'timestamp' column to UNIX timestamp (seconds since epoch)
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['timestamp'] = df['timestamp'].view('int64') // 10**9  # Convert to seconds since epoch


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# 1. Load the dataset
file_path = r"C:\Users\palak\Desktop\sem6\Intel_unaati\zipped_processed_student_data_sampled.csv"
df = pd.read_csv(file_path)

# 2. Convert 'timestamp' column to UNIX timestamp (if exists)
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['timestamp'] = df['timestamp'].view('int64') // 10**9  # Convert to seconds since epoch

# 3. One-hot encode categorical columns (if needed)
df = pd.get_dummies(df, columns=['action_type', 'platform'], drop_first=True)

# 4. Split the data into features (X) and target (y)
X = df.drop(columns=['user_answer'])  # Replace 'user_answer' with your actual target column name
y = df['user_answer']  # Replace 'user_answer' with your actual target column name

# 5. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Normalize the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Reshape for LSTM (samples, time steps, features)
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# 8. Define the LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, X_train_lstm.shape[2])),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary Classification (0 or 1)
])

# 9. Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 10. Train the model
history = model.fit(
    X_train_lstm, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_lstm, y_test),
    verbose=1
)

# 11. Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_lstm, y_test, verbose=1)

# 12. Print the results
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


ValueError: could not convert string to float: 'q7257'