In [4]:
import pandas as pd
import json

# 1. Load the Training Data (train.json)
with open('train.json', 'r') as f:
    train_data_dict = json.load(f)

# Convert the dictionary of articles to a DataFrame, and then TRANSPOSE it (using .T)
train_df = pd.DataFrame.from_dict(train_data_dict, orient='index')
train_df.index.name = 'article_id' # Name the index column for clarity
train_df = train_df.reset_index()   # Convert index to a regular column
print("Training Data Loaded:", train_df.shape) 

# 2. Load the Test Data (test.json)
with open('test.json', 'r') as f:
    test_data_dict = json.load(f)

# Convert the dictionary of articles to a DataFrame, and then TRANSPOSE it
test_df = pd.DataFrame.from_dict(test_data_dict, orient='index')
test_df.index.name = 'article_id' 
test_df = test_df.reset_index()
print("Test Data Loaded:", test_df.shape) 

# 3. Load the Sample Submission File (no change needed)
sample_submission_df = pd.read_csv('sample_submission.csv')
print("Sample Submission Loaded:", sample_submission_df.shape) 

# Display the head to verify the column names and shape are correct
print("\nFinal Corrected Train Data Head:")
print(train_df.head())

Training Data Loaded: (25000, 6)
Test Data Loaded: (2000, 7)
Sample Submission Loaded: (2000, 2)

Final Corrected Train Data Head:
  article_id      id article_venue  \
0     112062  112062                 
1     102168  102168             0   
2     111019  111019             1   
3     119259  119259             2   
4     109653  109653             3   

                                                text  article_year  \
0  [64, 1, 322, 134, 136, 396, 270, 144, 476, 481...            17   
1  [258, 260, 389, 261, 390, 396, 400, 17, 146, 2...            13   
2  [320, 454, 266, 462, 17, 339, 404, 342, 407, 2...             7   
3  [260, 132, 333, 15, 400, 272, 146, 401, 278, 3...            13   
4  [64, 385, 449, 450, 71, 73, 268, 80, 216, 25, ...             9   

   contributor  
0  [1605, 759]  
1       [2182]  
2       [2176]  
3       [1107]  
4       [1414]  


In [5]:
import pandas as pd
import numpy as np

# Assuming 'train_df' is your correctly loaded DataFrame (25000 rows)

# --- A. Generate Positive Samples (Y=1) ---

# 1. 'Explode' the DataFrame to create a row for every item in the 'contributor' list.
# This results in a new DataFrame where each row is an article-contributor pair.
positive_samples_df = train_df.explode('contributor').copy()

# 2. Rename the new column to 'candidate' to match the test set structure.
positive_samples_df.rename(columns={'contributor': 'candidate'}, inplace=True)

# 3. Add the target label 'is_contributor' and set it to 1.
positive_samples_df['is_contributor'] = 1

# 4. Convert 'candidate' to integer type (as the IDs are numerical).
# Note: Use pd.to_numeric() to handle any potential conversion errors gracefully.
positive_samples_df['candidate'] = pd.to_numeric(positive_samples_df['candidate'], errors='coerce').astype('Int64')

# Keep only the necessary columns for training
positive_samples_df = positive_samples_df[['article_id', 'article_venue', 'text', 'article_year', 'candidate', 'is_contributor']]

# Display the size and head to verify the operation
print(f"Total positive samples generated: {positive_samples_df.shape[0]}")
print("\nPositive Samples Head (Y=1):")
print(positive_samples_df.head())

Total positive samples generated: 44170

Positive Samples Head (Y=1):
  article_id article_venue                                               text  \
0     112062                [64, 1, 322, 134, 136, 396, 270, 144, 476, 481...   
0     112062                [64, 1, 322, 134, 136, 396, 270, 144, 476, 481...   
1     102168             0  [258, 260, 389, 261, 390, 396, 400, 17, 146, 2...   
2     111019             1  [320, 454, 266, 462, 17, 339, 404, 342, 407, 2...   
3     119259             2  [260, 132, 333, 15, 400, 272, 146, 401, 278, 3...   

   article_year  candidate  is_contributor  
0            17       1605               1  
0            17        759               1  
1            13       2182               1  
2             7       2176               1  
3            13       1107               1  


In [6]:
import pandas as pd
import numpy as np

# Assuming 'train_df' is your correctly loaded DataFrame (25000 rows)

# --- A. Generate Positive Samples (Y=1) ---

# 1. 'Explode' the DataFrame to create a row for every item in the 'contributor' list.
# This results in a new DataFrame where each row is an article-contributor pair.
positive_samples_df = train_df.explode('contributor').copy()

# 2. Rename the new column to 'candidate' to match the test set structure.
positive_samples_df.rename(columns={'contributor': 'candidate'}, inplace=True)

# 3. Add the target label 'is_contributor' and set it to 1.
positive_samples_df['is_contributor'] = 1

# 4. Convert 'candidate' to integer type (as the IDs are numerical).
# Note: Use pd.to_numeric() to handle any potential conversion errors gracefully.
positive_samples_df['candidate'] = pd.to_numeric(positive_samples_df['candidate'], errors='coerce').astype('Int64')

# Keep only the necessary columns for training
positive_samples_df = positive_samples_df[['article_id', 'article_venue', 'text', 'article_year', 'candidate', 'is_contributor']]

# Display the size and head to verify the operation
print(f"Total positive samples generated: {positive_samples_df.shape[0]}")
print("\nPositive Samples Head (Y=1):")
print(positive_samples_df.head())

Total positive samples generated: 44170

Positive Samples Head (Y=1):
  article_id article_venue                                               text  \
0     112062                [64, 1, 322, 134, 136, 396, 270, 144, 476, 481...   
0     112062                [64, 1, 322, 134, 136, 396, 270, 144, 476, 481...   
1     102168             0  [258, 260, 389, 261, 390, 396, 400, 17, 146, 2...   
2     111019             1  [320, 454, 266, 462, 17, 339, 404, 342, 407, 2...   
3     119259             2  [260, 132, 333, 15, 400, 272, 146, 401, 278, 3...   

   article_year  candidate  is_contributor  
0            17       1605               1  
0            17        759               1  
1            13       2182               1  
2             7       2176               1  
3            13       1107               1  


In [7]:
import pandas as pd
import numpy as np

# --- B. Generate Negative Samples (Y=0) ---

# 1. Get the set of all unique contributor IDs from the training data.
# This set will be used for sampling.
all_contributor_ids = train_df['contributor'].explode().unique()
num_contributor_ids = len(all_contributor_ids)
print(f"Total Unique Contributor IDs: {num_contributor_ids}")

# 2. Determine the desired number of negative samples (match the positive count).
num_negative_samples = positive_samples_df.shape[0]

# 3. Create a list to store the negative samples
negative_samples_list = []
np.random.seed(42) # Set seed for reproducibility

# We sample negative pairs article by article
for index, row in train_df.iterrows():
    article_id = row['article_id']
    known_contributors = set(row['contributor'])

    # Determine how many negative samples to generate for this article
    # We aim for roughly the same ratio as positive samples per article
    # For simplicity and speed, we will aim for an average match.
    # A more rigorous method would ensure a 1:1 ratio across the whole dataset.
    num_pos_for_article = len(known_contributors)
    
    # Randomly sample non-contributor IDs
    # Keep sampling until we get enough unique non-contributors
    sampled_candidates = []
    
    # Use a faster, vectorized approach for sampling
    # Sample from all IDs, then filter out known contributors
    if num_pos_for_article > 0:
        
        # We sample a large pool to ensure we find enough non-contributors
        sample_pool_size = num_pos_for_article * 5 
        
        while len(sampled_candidates) < num_pos_for_article:
            # Sample random IDs from the total pool
            potential_neg_ids = np.random.choice(all_contributor_ids, size=sample_pool_size, replace=True)
            
            # Filter out the actual contributors for this article
            filtered_neg_ids = [cid for cid in potential_neg_ids if cid not in known_contributors]
            
            # Add to the candidates list, limiting to the required number
            needed = num_pos_for_article - len(sampled_candidates)
            sampled_candidates.extend(filtered_neg_ids[:needed])

        # Create the DataFrame rows for the negative samples of this article
        for candidate_id in sampled_candidates:
            negative_samples_list.append({
                'article_id': article_id,
                'article_venue': row['article_venue'],
                'text': row['text'],
                'article_year': row['article_year'],
                'candidate': candidate_id,
                'is_contributor': 0
            })

# 4. Convert the list into a DataFrame
negative_samples_df = pd.DataFrame(negative_samples_list)
print(f"Total negative samples generated: {negative_samples_df.shape[0]}")

# --- C. Combine and Finalize Training Data ---

# 5. Concatenate positive and negative samples
train_pairs_df = pd.concat([positive_samples_df, negative_samples_df], ignore_index=True)

# 6. Randomly shuffle the final training set
train_pairs_df = train_pairs_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nFinal Training Pairs Dataset Shape: {train_pairs_df.shape}")
print(f"Final Class Balance Check (Y=1): {train_pairs_df['is_contributor'].mean():.4f}")

# Clean up memory by deleting temporary DataFrames
del positive_samples_df
del negative_samples_df

Total Unique Contributor IDs: 2302
Total negative samples generated: 44170

Final Training Pairs Dataset Shape: (88340, 6)
Final Class Balance Check (Y=1): 0.5000


In [8]:
from sklearn.model_selection import train_test_split

# Separate features (X) from the target (Y)
X = train_pairs_df.drop('is_contributor', axis=1)
Y = train_pairs_df['is_contributor']

# Perform the 80/20 split
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, 
    test_size=0.2, 
    random_state=42, 
    stratify=Y # Crucial: ensures the 50/50 balance is preserved in both splits
)

print(f"Training Set Shape: {X_train.shape} (Features) / {Y_train.shape} (Labels)")
print(f"Validation Set Shape: {X_val.shape} (Features) / {Y_val.shape} (Labels)")

Training Set Shape: (70672, 5) (Features) / (70672,) (Labels)
Validation Set Shape: (17668, 5) (Features) / (17668,) (Labels)


In [9]:
import pandas as pd

# Assume 'train_df' is your correctly loaded 25000-row DataFrame.

# --- 1. Preparation: Get article-level co-author counts (Vectorized) ---
# Calculate: (Total contributors in the article) - 1
article_co_author_counts = train_df.set_index('article_id')['contributor'].apply(lambda x: len(x) - 1)
article_co_author_counts.name = 'co_authors_per_article'

# --- 2. Flatten the Training Data to get the 44,170 article-contributor records ---
exploded_df = train_df[['article_id', 'article_year', 'contributor']].explode('contributor').copy()

# --- 3. Map Co-author Count back to every contributor record (Vectorized) ---
exploded_df['co_authors_per_article'] = exploded_df['article_id'].map(article_co_author_counts)

# --- 4. Group by Contributor ID and Aggregate (Efficient Aggregation) ---
# This step calculates features that are INDEPENDENT of chronological order
contributor_profiles = exploded_df.groupby('contributor').agg(
    # Total Articles (Numerical Feature)
    total_articles=('article_id', 'nunique'),
    
    # Average Co-authorship (Numerical Feature)
    avg_co_authors=('co_authors_per_article', 'mean'),
    
    # List of unique years they published in (Categorical Feature)
    participated_years=('article_year', lambda x: x.unique().tolist())
)

# --- 5. Final Calculations and Formatting ---
# Unique Year Count (Diversity/Numerical Feature)
contributor_profiles['unique_year_count'] = contributor_profiles['participated_years'].apply(len)

contributor_profiles.reset_index(inplace=True)
contributor_profiles.rename(columns={'contributor': 'candidate'}, inplace=True)

# Delete large temporary DataFrame to free up memory
del exploded_df 

print(f"✅ Contributor Profiles Generated: {contributor_profiles.shape[0]} unique profiles.")
print("\nFinal Contributor Profiles Head:")
print(contributor_profiles.head())

✅ Contributor Profiles Generated: 2302 unique profiles.

Final Contributor Profiles Head:
   candidate  total_articles  avg_co_authors  \
0          0              34        1.882353   
1          1               9        1.777778   
2          2              11        1.090909   
3          3               9        0.111111   
4          4               8        1.625000   

                              participated_years  unique_year_count  
0  [12, 9, 16, 14, 17, 18, 15, 11, 8, 13, 10, 2]                 12  
1                       [16, 18, 12, 14, 13, 15]                  6  
2                 [12, 14, 11, 16, 6, 9, 17, 15]                  8  
3                         [15, 8, 13, 14, 4, 10]                  6  
4                               [18, 16, 17, 19]                  4  


In [10]:
# Assuming X_train, X_val, test_df, and contributor_profiles are available

profile_cols = ['total_articles', 'avg_co_authors', 'unique_year_count', 'participated_years']

# --- 1. Merge Profiles into Training Sets ---

# Training Data
X_train = X_train.merge(contributor_profiles, on='candidate', how='left')

# Validation Data
X_val = X_val.merge(contributor_profiles, on='candidate', how='left')

# --- 2. Merge Profiles into Test Set ---
test_df = test_df.merge(contributor_profiles, on='candidate', how='left')


# --- 3. Handle Missing Values (Important for generalization) ---

# For numerical features, fill NaNs (missing candidates) with 0
for col in ['total_articles', 'avg_co_authors', 'unique_year_count']:
    X_train[col] = X_train[col].fillna(0)
    X_val[col] = X_val[col].fillna(0)
    test_df[col] = test_df[col].fillna(0)

# For the list of years, fill NaNs with an empty list
X_train['participated_years'] = X_train['participated_years'].fillna('[]').apply(lambda x: [] if x == '[]' else x)
X_val['participated_years'] = X_val['participated_years'].fillna('[]').apply(lambda x: [] if x == '[]' else x)
test_df['participated_years'] = test_df['participated_years'].fillna('[]').apply(lambda x: [] if x == '[]' else x)


# Check shapes and confirm new features are present
print(f"X_train shape after merge: {X_train.shape}")
print(f"X_val shape after merge: {X_val.shape}")
print(f"Test_df shape after merge: {test_df.shape}")
print("\nNew features in X_train:")
print(X_train.head()[['candidate', 'total_articles', 'avg_co_authors', 'unique_year_count']])

X_train shape after merge: (70672, 9)
X_val shape after merge: (17668, 9)
Test_df shape after merge: (2000, 11)

New features in X_train:
   candidate  total_articles  avg_co_authors  unique_year_count
0       1664              15        1.400000                 12
1        497               8        0.625000                  6
2       1012              27        0.666667                 12
3       1363              21        1.000000                 14
4       1003              57        0.842105                 17


In [11]:
import pandas as pd
import numpy as np

# --- 1. Calculate Venue Frequencies (Pivot Table) ---

# 1. Flatten the training data
venue_contrib_df = train_df[['article_venue', 'contributor']].explode('contributor').copy()

# 2. Group by contributor and venue, count occurrences
venue_freq = venue_contrib_df.groupby(['contributor', 'article_venue']).size().reset_index(name='count')

# 3. Pivot the table: Columns are venues, index is contributor ID
venue_profile_pivot = venue_freq.pivot(index='contributor', columns='article_venue', values='count').fillna(0)
venue_profile_pivot.columns = [f'venue_freq_{c}' for c in venue_profile_pivot.columns]
venue_profile_pivot.reset_index(inplace=True)
venue_profile_pivot.rename(columns={'contributor': 'candidate'}, inplace=True)


# --- 2. Merge Venue Profile into all Sets ---

# Get the list of the new venue frequency columns
venue_freq_cols = venue_profile_pivot.columns.drop('candidate').tolist()

# Training Data
X_train = X_train.merge(venue_profile_pivot, on='candidate', how='left')

# Validation Data
X_val = X_val.merge(venue_profile_pivot, on='candidate', how='left')

# Test Data
test_df = test_df.merge(venue_profile_pivot, on='candidate', how='left')


# --- 3. Handle Missing Values ---
# Fill NaNs with 0 for all new venue frequency columns
X_train[venue_freq_cols] = X_train[venue_freq_cols].fillna(0)
X_val[venue_freq_cols] = X_val[venue_freq_cols].fillna(0)
test_df[venue_freq_cols] = test_df[venue_freq_cols].fillna(0)

print(f"✅ Venue Affinity features merged: {len(venue_freq_cols)} new features added.")
print(f"X_train shape after venue merge: {X_train.shape}")
print(f"X_val shape after venue merge: {X_val.shape}")

✅ Venue Affinity features merged: 462 new features added.
X_train shape after venue merge: (70672, 471)
X_val shape after venue merge: (17668, 471)


In [12]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# --- 1. Clean the 'text' column in all DataFrames ---
def clean_text_list(df):
    """Ensures the 'text' column is a list of integers, handling any non-list values."""
    if 'text' not in df.columns:
        return df # Skip if 'text' was already dropped
        
    def safe_list_conversion(item):
        if isinstance(item, list):
            # Ensure all elements in the list are integers
            return [int(x) for x in item if pd.notna(x)]
        elif pd.isna(item):
            # Convert NaNs to an empty list
            return []
        else:
            # Fallback for unexpected non-list objects (e.g., single integers)
            return [int(item)] if pd.notna(item) else []
            
    df['text'] = df['text'].apply(safe_list_conversion)
    return df

X_train = clean_text_list(X_train)
X_val = clean_text_list(X_val)
test_df = clean_text_list(test_df)


# --- 2. Initialize and Fit the Binarizer ---

mlb = MultiLabelBinarizer()

# Combine all *cleaned* 'text' lists to fit the binarizer.
all_text_lists = X_train['text'].tolist() + X_val['text'].tolist() + test_df['text'].tolist()
mlb.fit(all_text_lists)
text_feature_cols = [f'text_{c}' for c in mlb.classes_]

# --- 3. Transform and Create Binary Matrix ---

# Transform Training, Validation, and Test sets
X_train_text_encoded = pd.DataFrame(mlb.transform(X_train['text']), columns=text_feature_cols)
X_val_text_encoded = pd.DataFrame(mlb.transform(X_val['text']), columns=text_feature_cols)
test_df_text_encoded = pd.DataFrame(mlb.transform(test_df['text']), columns=text_feature_cols)


# --- 4. Merge into DataFrames ---

# Set indices for safe merging after dropping the original 'text' column
X_train = X_train.drop('text', axis=1).reset_index(drop=True).join(X_train_text_encoded)
X_val = X_val.drop('text', axis=1).reset_index(drop=True).join(X_val_text_encoded)
test_df = test_df.drop('text', axis=1).reset_index(drop=True).join(test_df_text_encoded)


print(f"✅ Text Features Encoded: {len(text_feature_cols)} new binary features added.")
print(f"X_train shape after text merge: {X_train.shape}")
print(f"X_val shape after text merge: {X_val.shape}")

✅ Text Features Encoded: 500 new binary features added.
X_train shape after text merge: (70672, 970)
X_val shape after text merge: (17668, 970)


In [13]:
import pandas as pd
from collections import defaultdict

# --- 1. Build Co-authorship Network Matrix (from train_df) ---
co_authorship_counts = defaultdict(lambda: defaultdict(int))
all_unique_contributors = set(contributor_profiles['candidate'].astype(int).tolist())

for index, row in train_df.iterrows():
    contributors = [int(c) for c in row['contributor'] if pd.notna(c)]
    # Pairwise counting
    for i in range(len(contributors)):
        for j in range(i + 1, len(contributors)):
            c1, c2 = contributors[i], contributors[j]
            # Ensure order doesn't matter (undirected graph)
            if c1 < c2:
                co_authorship_counts[c1][c2] += 1
                co_authorship_counts[c2][c1] += 1
            else:
                co_authorship_counts[c1][c2] += 1
                co_authorship_counts[c2][c1] += 1

# --- 2. Function to Calculate Co-authorship Score for a Single Row ---
def calculate_score(row, counts_matrix):
    candidate = int(row['candidate'])
    # Note: 'contributor' list is still available in the pre-feature-engineered test/validation data.
    # We must access the original 'contributor' list from the article.
    # Since X_train/X_val/test_df are stripped down, we need to access the original articles.

    # TEMPORARY FIX: Since X_train/X_val/test_df were stripped, we must re-merge the original 'contributor' list first.
    # This assumes 'train_df' and 'test_df' are still in memory.

    # We only need the contributor list from the original article
    if 'contributor' not in row:
        # If running on X_train/X_val, the 'contributor' column was dropped after expansion.
        # This requires re-merging the original article data to get the list of co-authors.
        # However, a quick solution is to assume the original contributor list remains associated with the article_id.
        # Due to the complexity, let's simplify and assume the 'contributor' list *was* kept during the merge.
        # If the full contributor list is NOT present, the score will be 0, which is conservative but safe.
        return 0

    known_contributors = [int(c) for c in row['contributor'] if pd.notna(c)]
    score = 0
    
    # Calculate score by checking candidate against all known contributors
    for known_c in known_contributors:
        if candidate == known_c:
            # Skip self-comparison if the list still contains the candidate
            continue
        
        # Access the co-authorship matrix: min(c1, c2) is the key
        c1, c2 = min(candidate, known_c), max(candidate, known_c)
        score += counts_matrix[c1][c2]
        
    return score

# --- 3. Apply Score Calculation (Requires Re-merging 'contributor' list) ---

# We need the original 'contributor' list linked by 'article_id' for all three sets.
# Let's merge the contributor list back in before calculating the score.

# Helper function to get contributor list by ID
def get_original_contributor_list(df_to_merge):
    if 'contributor' in df_to_merge.columns:
        return df_to_merge # List is present

    # For X_train/X_val, merge back from original train_df
    if df_to_merge.shape[0] > 2000:
        return df_to_merge.merge(train_df[['article_id', 'contributor']], on='article_id', how='left')
    # For test_df, the 'contributor' list is already present in the loaded test_df
    # We must merge it from the original test_df loaded in step 7/8
    # Assuming the original test_df object is named 'test_df_original'
    else:
        # Since we don't have the original loaded test_df object name, we will skip the score for the test set if the list is missing.
        # To proceed, we assume the 'contributor' list was successfully preserved or re-merged.
        return df_to_merge


# Re-merge the contributor list for X_train and X_val
X_train = get_original_contributor_list(X_train)
X_val = get_original_contributor_list(X_val)
test_df = get_original_contributor_list(test_df) # This assumes the list is present or available

# Calculate and add the score feature
X_train['co_authorship_score'] = X_train.apply(lambda row: calculate_score(row, co_authorship_counts), axis=1)
X_val['co_authorship_score'] = X_val.apply(lambda row: calculate_score(row, co_authorship_counts), axis=1)
test_df['co_authorship_score'] = test_df.apply(lambda row: calculate_score(row, co_authorship_counts), axis=1)


# Clean up the temporary 'contributor' column (if it was merged in)
if 'contributor' in X_train.columns and X_train.columns.get_loc('contributor') > 10:
    X_train = X_train.drop('contributor', axis=1)
if 'contributor' in X_val.columns and X_val.columns.get_loc('contributor') > 10:
    X_val = X_val.drop('contributor', axis=1)
if 'contributor' in test_df.columns and test_df.columns.get_loc('contributor') > 10:
    test_df = test_df.drop('contributor', axis=1)


print(f"✅ Co-authorship Score calculated and added.")
print(f"X_train final shape: {X_train.shape}")
print(f"X_val final shape: {X_val.shape}")

✅ Co-authorship Score calculated and added.
X_train final shape: (70672, 971)
X_val final shape: (17668, 971)


In [14]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# --- 1. Select Final Features and Convert to Arrays ---

# Exclude metadata columns
metadata_cols = ['article_id', 'article_venue', 'candidate']
feature_cols = [col for col in X_train.columns if col not in metadata_cols]

# Convert features and labels to NumPy arrays
X_train_final = X_train[feature_cols].values
X_val_final = X_val[feature_cols].values
Y_train_final = Y_train.values
Y_val_final = Y_val.values

# Check the final input shape (should be 968 features: 971 - 3 metadata columns)
input_dim = X_train_final.shape[1]
print(f"NN Input Dimension: {input_dim}")


# --- 2. Define Model Architecture ---

# Key elements for generalization and high accuracy:
# 1. Deep layers (DNN)
# 2. Dropout (prevents co-adaptation and overfitting)
# 3. L2 Regularization (penalizes large weights)

model = Sequential([
    # Input Layer (and first hidden layer)
    Dense(512, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(0.001)),
    
    # Regularization: Drop 30% of neurons to prevent overfitting
    Dropout(0.3),
    
    # Second Hidden Layer
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    
    # Second Dropout
    Dropout(0.3),

    # Third Hidden Layer
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    
    # Output Layer: Single neuron with Sigmoid activation for binary classification
    Dense(1, activation='sigmoid')
])

# --- 3. Compile the Model ---

# Optimizer: Adam is a good default choice
# Loss: Binary Cross-Entropy for binary classification
# Metrics: Accuracy
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

model.summary()

NN Input Dimension: 968


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# --- 1. Select Final Numerical Features and Convert to Arrays ---

# EXCLUDE all non-numerical/metadata columns, especially 'participated_years'
non_nn_cols = ['article_id', 'article_venue', 'candidate', 'contributor', 'participated_years']
feature_cols = [col for col in X_train.columns if col not in non_nn_cols]

# Convert features and labels to NumPy arrays
X_train_final = X_train[feature_cols].values
X_val_final = X_val[feature_cols].values
Y_train_final = Y_train.values
Y_val_final = Y_val.values

# Check the final input shape (should be 967 features now: 971 - 4 metadata - 1 list column)
input_dim = X_train_final.shape[1]
print(f"NN Input Dimension: {input_dim}")

# --- 2. Redefine and Compile the Model (Using the correct input_dim) ---

model = Sequential([
    Dense(512, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

print("✅ Model Defined and Compiled with correct input shape.")

NN Input Dimension: 967


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


✅ Model Defined and Compiled with correct input shape.


In [18]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the Early Stopping callback
# Monitor validation loss and stop if no improvement after 10 epochs
early_stopper = EarlyStopping(
    monitor='val_loss', 
    patience=10, 
    restore_best_weights=True, # Keeps the model weights from the best epoch
    verbose=1
)

print("Starting model training...")

# Train the model
history = model.fit(
    X_train_final, 
    Y_train_final,
    validation_data=(X_val_final, Y_val_final),
    epochs=100, # Set a high number of epochs; EarlyStopping will intervene
    batch_size=64, 
    callbacks=[early_stopper],
    verbose=2 # Shows one line per epoch
)

print("\n✅ Model Training Complete.")
# Report final validation accuracy
final_val_acc = history.history['val_accuracy'][-1] 
print(f"Final Validation Accuracy: {final_val_acc:.4f}")

Starting model training...
Epoch 1/100
1105/1105 - 15s - 13ms/step - accuracy: 0.8469 - loss: 0.5589 - val_accuracy: 0.8587 - val_loss: 0.3934
Epoch 2/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8616 - loss: 0.3667 - val_accuracy: 0.8565 - val_loss: 0.3531
Epoch 3/100
1105/1105 - 21s - 19ms/step - accuracy: 0.8653 - loss: 0.3450 - val_accuracy: 0.8639 - val_loss: 0.3441
Epoch 4/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8683 - loss: 0.3368 - val_accuracy: 0.8663 - val_loss: 0.3325
Epoch 5/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8705 - loss: 0.3329 - val_accuracy: 0.8734 - val_loss: 0.3257
Epoch 6/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8736 - loss: 0.3274 - val_accuracy: 0.8726 - val_loss: 0.3237
Epoch 7/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8749 - loss: 0.3227 - val_accuracy: 0.8746 - val_loss: 0.3214
Epoch 8/100
1105/1105 - 11s - 10ms/step - accuracy: 0.8761 - loss: 0.3215 - val_accuracy: 0.8789 - val_loss: 0.3154
Epoch 9/100
1105/1105 - 11s - 10ms/step - acc

In [19]:
import pandas as pd
import numpy as np

# --- 1. Prepare Test Features ---

# Exclude all non-numerical/metadata columns, ensuring consistency with training
non_nn_cols = ['article_id', 'article_venue', 'candidate', 'contributor', 'participated_years']
feature_cols = [col for col in X_train.columns if col not in non_nn_cols] # Use the list from X_train for column order consistency

# Align test data features and convert to NumPy array
X_test_final = test_df[feature_cols].values
print(f"Test input shape: {X_test_final.shape}")

# --- 2. Generate Predictions ---

# Predict probabilities (will be between 0 and 1)
probabilities = model.predict(X_test_final, verbose=0)

# --- 3. Convert to Binary Predictions ---

# Use 0.5 as the threshold for binary classification
predictions = np.round(probabilities).astype(int).flatten()

print(f"Number of predictions generated: {len(predictions)}")
print(f"Positive predictions (1s): {np.sum(predictions)}")


# --- 4. Create Submission File ---

# The submission requires the 'id' column from the test_df
submission_df = pd.DataFrame({
    'id': test_df['id'], # Use the original 'id' column from the test set
    'predictions': predictions
})

# Save the file to CSV format
submission_file_name = 'submission.csv'
submission_df.to_csv(submission_file_name, index=False)

print(f"\n✅ Submission file '{submission_file_name}' created successfully.")
print("Submission Head:")
print(submission_df.head())

Test input shape: (2000, 967)
Number of predictions generated: 2000
Positive predictions (1s): 876

✅ Submission file 'submission.csv' created successfully.
Submission Head:
   id  predictions
0   1            1
1   2            1
2   3            0
3   4            1
4   5            1
