In [1]:
import csv
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [2]:
# Plot styling for the rest of the notebook.
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): presumably the order matters, since both calls may set the
# default color cycle and the later one would win — confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [9]:
def load_datasets(data_dir='../data/raw'):
    """Load the train, test and validation splits from tab-separated files.

    Each split is read from ``<data_dir>/<split>.tsv``. The files carry no
    header row, so the 14 column names are supplied here.

    Quote handling is switched off (``csv.QUOTE_NONE``). The ``statement``
    column is free text, and with pandas' default quoting a field that begins
    with an unbalanced double quote swallows everything up to the next quote
    character, including line breaks, which silently merges rows. With
    quoting disabled every physical line is one record and quote characters
    are kept verbatim in the text.
    NOTE(review): if the raw files contain such fields, the printed shapes
    will be larger than in earlier runs of this notebook; confirm the new
    counts against the dataset's published split sizes.

    Parameters
    ----------
    data_dir : str or path-like, default '../data/raw'
        Directory containing ``train.tsv``, ``test.tsv`` and ``valid.tsv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df, valid_df)``, in that order.

    Raises
    ------
    FileNotFoundError
        If any of the three split files is missing from ``data_dir``.
    """
    columns = ['id', 'label', 'statement', 'subject', 'speaker', 'speaker_title','state_info', 'party_affiliation', 'barely_true_counts','false_counts', 'half_true_counts', 'mostly_true_counts','pants_fire_counts', 'context']
    raw_dir = Path(data_dir)

    def _read_split(split_name):
        # One reader for all splits so the parsing options cannot drift apart.
        return pd.read_csv(
            raw_dir / f'{split_name}.tsv',
            sep='\t',
            header=None,
            names=columns,
            quoting=csv.QUOTE_NONE,
        )

    train_df = _read_split('train')
    test_df = _read_split('test')
    valid_df = _read_split('valid')

    print("Dataset loaded successfully!")
    print(f"Training set: {train_df.shape}")
    print(f"Test set: {test_df.shape}")
    print(f"Validation set: {valid_df.shape}")

    return train_df, test_df, valid_df

# Load the three splits into notebook-level DataFrames used by the cells below.
# Unpacking order matches the function's return order: train, test, validation.
train_df, test_df, valid_df = load_datasets()

Dataset loaded successfully!
Training set: (10240, 14)
Test set: (1267, 14)
Validation set: (1284, 14)


In [12]:
# Basic exploration
# Each section banner starts with a real newline ("\n", not the escaped
# "\\n") so it is separated from the previous section by a blank line
# instead of printing a literal backslash-n.
print("\n=== BASIC DATA INFO ===")
print(f"Total samples: {len(train_df) + len(test_df) + len(valid_df)}")
# Counts every column of train_df, not only model inputs.
print(f"Features: {len(train_df.columns)}")

print("\n=== LABEL DISTRIBUTION (Training) ===")
print(train_df['label'].value_counts())

print("\n=== MISSING VALUES ===")
# Count nulls per column once, then show only columns with at least one.
missing_counts = train_df.isnull().sum()
print(missing_counts[missing_counts > 0])

\n=== BASIC DATA INFO ===
Total samples: 12791
Features: 14
\n=== LABEL DISTRIBUTION (Training) ===
label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64
\n=== MISSING VALUES ===
subject                  2
speaker                  2
speaker_title         2898
state_info            2210
party_affiliation        2
barely_true_counts       2
false_counts             2
half_true_counts         2
mostly_true_counts       2
pants_fire_counts        2
context                102
dtype: int64


In [15]:
# Save processed data for team members
# NOTE(review): no cleaning or transformation step is visible between loading
# and saving in this view, so these files may be the loaded splits
# re-serialized as CSV — confirm that is intended.
processed_dir = Path('../data/processed')
# to_csv does not create missing parent directories; make sure the target
# exists so the cell also works on a fresh checkout.
processed_dir.mkdir(parents=True, exist_ok=True)

for split_name, split_df in (('train', train_df), ('test', test_df), ('valid', valid_df)):
    split_df.to_csv(processed_dir / f'{split_name}_processed.csv', index=False)