# Data Quickstart & Sanity Checks
Use this notebook to confirm required datasets exist and preview a few rows per domain before running flows.

In [1]:
# Path setup
import sys
from pathlib import Path
project_root = Path('..').resolve()
sys.path.append(str(project_root / 'src'))
print('Project root:', project_root)

Project root: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence


## Expected raw data paths
Update this list if you use different filenames.

In [2]:
from pprint import pprint
paths = {
    'fraud': project_root / 'data/raw/fraud/creditcard.csv',
    'cyber': project_root / 'data/raw/cyber/kitsune_mirai.csv',
    'behavior': project_root / 'data/raw/behavior/online_shoppers_intention.csv',
    'nlp_enron': project_root / 'data/raw/nlp/enron_emails.csv',
    'nlp_fakenews_dir': project_root / 'data/raw/nlp/fakenews',
    'vision_root': project_root / 'data/raw/vision',
}
missing = {k: p for k, p in paths.items() if not p.exists()}
print('Missing paths:' if missing else 'All listed paths exist')
pprint(missing)

Missing paths:
{'cyber': PosixPath('/Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/cyber/kitsune_mirai.csv'),
 'nlp_enron': PosixPath('/Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/nlp/enron_emails.csv')}


## Preview tabular datasets (fraud, cyber, behavior)
Guarded reads: skips if file not found.

In [3]:
import pandas as pd
def safe_head(path, n=5):
    if not path.exists():
        print(f'Missing: {path}')
        return
    try:
        df = pd.read_csv(path)
        print(f'File: {path.name}, rows={len(df)}, cols={len(df.columns)}')
        display(df.head(n))
    except Exception as e:
        print(f'Failed to read {path}:', e)

safe_head(paths['fraud'])
safe_head(paths['cyber'])
safe_head(paths['behavior'])

File: creditcard.csv, rows=284807, cols=31


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Missing: /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/cyber/kitsune_mirai.csv
File: online_shoppers_intention.csv, rows=12330, cols=18


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


## Preview NLP (Enron or Fake News)
Shows a few texts if available.

In [4]:
from uais.data.load_datasets import load_enron_emails
try:
    df_enron = load_enron_emails(subset=3)
    display(df_enron.head(3))
except Exception as e:
    print('Enron load skipped:', e)

fake_news_dir = paths['nlp_fakenews_dir']
if fake_news_dir.exists():
    fake_csv = list(fake_news_dir.rglob('Fake.csv'))
    true_csv = list(fake_news_dir.rglob('True.csv'))
    if fake_csv and true_csv:
        try:
            import pandas as pd
            df_fake = pd.read_csv(fake_csv[0]).head(2)
            df_true = pd.read_csv(true_csv[0]).head(2)
            display(df_fake.head(2))
            display(df_true.head(2))
        except Exception as e:
            print('Fake/True preview skipped:', e)
else:
    print('Fake news dataset not found')

Enron load skipped: Enron CSV not found at /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/nlp/enron_emails.csv. Run download_nlp_vision.py first.


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"


## Preview vision layout
Lists a few subfolders/files (no heavy image load).

In [5]:
vision_root = paths['vision_root']
if vision_root.exists():
    entries = sorted([p for p in vision_root.iterdir() if p.is_dir()])[:5]
    print('Vision subdirs:', [e.name for e in entries])
else:
    print('Vision root not found')

Vision subdirs: ['datasets', 'train_fake', 'train_real']


## Optional: CIFAR-10 shape check (if downloaded via download_nlp_vision)

In [6]:
try:
    from uais.data.load_datasets import load_cifar10
    X, y = load_cifar10('train')
    print('CIFAR-10 train shape:', X.shape, y.shape)
except Exception as e:
    print('CIFAR check skipped:', e)

CIFAR check skipped: CIFAR-10 directory not found at /Users/pratik_n/Desktop/MyComputer/universal-anomaly-intelligence/data/raw/vision/cifar-10-python. Run download_nlp_vision.py first.


## Load all CSVs per domain (quick head)
Attempts to read every CSV under each domain folder and show the first few rows. Skips on errors/large files.

In [7]:
import pandas as pd
from itertools import islice
domains_dirs = {
    'fraud': project_root / 'data/raw/fraud',
    'cyber': project_root / 'data/raw/cyber',
    'behavior': project_root / 'data/raw/behavior',
    'nlp': project_root / 'data/raw/nlp',
    'vision': project_root / 'data/raw/vision',
}
for dom, root in domains_dirs.items():
    print(f"\n== {dom.upper()} ==")
    if not root.exists():
        print('missing root', root)
        continue
    csvs = list(root.rglob('*.csv'))
    if not csvs:
        print('no CSVs found')
        continue
    for csv_path in islice(csvs, 5):  # cap to 5 files to keep it light
        try:
            df_tmp = pd.read_csv(csv_path)
            print(f"{csv_path.name}: rows={len(df_tmp)}, cols={len(df_tmp.columns)}")
            display(df_tmp.head(2))
        except Exception as e:
            print(f"skip {csv_path.name}: {e}")
    if len(csvs) > 5:
        print(f"...skipped {len(csvs)-5} more files")


== FRAUD ==
creditcard.csv: rows=284807, cols=31


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0


paysim_transactions.csv: rows=6362620, cols=11


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0



== CYBER ==
UNSW-NB15_3.csv: rows=700000, cols=49


Unnamed: 0,59.166.0.1,18247,149.171.126.4,7662,tcp,FIN,0.119596,4550,68342,31,...,Unnamed: 12,6,2,2.1,5,1,1.1,2.2,Unnamed: 47,0.4
0,59.166.0.3,54771,149.171.126.2,27709,tcp,FIN,0.650574,8928,320,31,...,,3,5,2,4,1,1,4,,0
1,59.166.0.8,13289,149.171.126.9,5190,tcp,FIN,0.00798,2158,2464,31,...,,3,5,1,1,1,1,3,,0


UNSW-NB15_2.csv: rows=700000, cols=49


  df_tmp = pd.read_csv(csv_path)


Unnamed: 0,59.166.0.0,6055,149.171.126.5,54145,tcp,FIN,0.072974,4238,60788,31,...,0.6,13,13.1,6,7.1,1,1.1,2,Unnamed: 47,0.7
0,59.166.0.0,7832,149.171.126.3,5607,tcp,FIN,0.144951,5174,91072,31,...,0,13,13,6,7,1,1,2,,0
1,59.166.0.8,11397,149.171.126.6,21,tcp,FIN,0.116107,2934,3742,31,...,1,1,2,7,5,1,1,4,,0


skip NUSW-NB15_features.csv: 'utf-8' codec can't decode byte 0x92 in position 1646: invalid start byte
UNSW-NB15_1.csv: rows=700000, cols=49


  df_tmp = pd.read_csv(csv_path)


Unnamed: 0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0.17,3,7,1,3.1,1.1,1.2,1.3,Unnamed: 47,0.18
0,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
1,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0


UNSW-NB15_4.csv: rows=440043, cols=49


Unnamed: 0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,Unnamed: 12,2,2.1,7,4,1,1.1,3,Unnamed: 47,0.4
0,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
1,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0


...skipped 3 more files

== BEHAVIOR ==
online_shoppers_intention.csv: rows=12330, cols=18


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False


device.csv: rows=405380, cols=5


Unnamed: 0,id,date,user,pc,activity
0,{J1S3-L9UU75BQ-7790ATPL},01/02/2010 07:21:06,MOH0273,PC-6699,Connect
1,{N7B5-Y7BB27SI-2946PUJK},01/02/2010 07:37:41,MOH0273,PC-6699,Disconnect


psychometric.csv: rows=1000, cols=7


Unnamed: 0,employee_name,user_id,O,C,E,A,N
0,Calvin Edan Love,CEL0561,40,39,36,19,40
1,Christine Reagan Deleon,CRD0624,26,22,17,39,32


file.csv: rows=445581, cols=6


Unnamed: 0,id,date,user,pc,filename,content
0,{L9G8-J9QE34VM-2834VDPB},01/02/2010 07:23:14,MOH0273,PC-6699,EYPC9Y08.doc,D0-CF-11-E0-A1-B1-1A-E1 during difficulty over...
1,{H0W6-L4FG38XG-9897XTEN},01/02/2010 07:26:19,MOH0273,PC-6699,N3LTSU3O.pdf,25-50-44-46-2D carpenters 25 landed strait dis...


logon.csv: rows=854859, cols=5


Unnamed: 0,id,date,user,pc,activity
0,{X1D9-S0ES98JV-5357PWMI},01/02/2010 06:49:00,NGF0157,PC-6056,Logon
1,{G2B3-L6EJ61GT-2222RKSO},01/02/2010 06:50:00,LRR0148,PC-4275,Logon


...skipped 20 more files

== NLP ==
fake_news_labeled.csv: rows=44898, cols=2


Unnamed: 0,content,label
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1


Fake.csv: rows=23481, cols=4


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"


True.csv: rows=21417, cols=4


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"



== VISION ==
no CSVs found


## Notes
- If a dataset is missing, use the download scripts under `src/uais/data/`.
- Re-run this notebook after adding data to confirm everything is in place.