# Using basic features

In [81]:
from fastai.tabular.all import *
import os 
import pandas as pd

In [82]:
# Read the training table from ../input/train.csv into a DataFrame.
train_csv = os.path.join('..', 'input', 'train.csv')
data = pd.read_csv(train_csv)

In [83]:
# Remove the four columns that are not fed to the model and store the target
# as a pandas categorical.
unused_columns = ['pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy']
data = (
    data
    .drop(columns=unused_columns)
    .assign(label=lambda frame: frame['label'].astype('category'))
)

In [84]:
# Hold out a random 20% of the rows for validation. The seed pins which rows
# land in the validation set, so metrics are comparable across re-runs.
# Only the split is seeded here; training itself is not.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(data))

In [85]:
# Every column except the target is passed as a continuous feature.
feature_columns = [name for name in data.columns if name != 'label']
to = TabularPandas(
    data,
    procs=[Categorify, FillMissing, Normalize],
    cont_names=feature_columns,
    y_names='label',
    splits=splits,
)

In [86]:
# Show the first two processed feature rows as a sanity check.
to.xs.head(2)


Unnamed: 0,title 12,title 12.1,title 12.2,title 12.3,title 12.4,title 12.5,title 12.6,title 12.7,title 12.8,title 12.9,...,BOW 656,BOW 657,BOW 658,BOW 659,BOW 660,BOW 661,BOW 662,BOW 663,BOW 664,BOW 665
504,0.916657,-1.026096,-1.902946,-1.895747,-1.763421,-1.616443,-1.195742,0.186429,0.629783,0.957182,...,-0.041551,-0.059895,-0.133174,-0.100165,-0.10795,-0.076437,-0.07713,-0.296635,-0.116563,-0.096318
1527,0.344338,1.325571,0.351219,0.354535,0.426706,0.454704,0.403439,-0.614392,-0.536984,-0.543991,...,-0.041551,-0.059895,-0.133174,-0.100165,-0.10795,-0.076437,-0.07713,-0.296635,-0.116563,-0.096318


In [87]:
# Build the DataLoaders from the processed table with a batch size of 64.
batch_size = 64
dls = to.dataloaders(bs=batch_size)

In [88]:
# Create the tabular learner; accuracy and F1 are the reported metrics.
eval_metrics = [accuracy, F1Score()]
learn = tabular_learner(dls, metrics=eval_metrics)

In [89]:
# Train for five epochs with the one-cycle schedule.
learn.fit_one_cycle(n_epoch=5)

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,0.578705,0.502656,0.774597,0.741803,00:00
1,0.439156,0.578454,0.765653,0.719486,00:00
2,0.330821,0.609222,0.796064,0.759494,00:00
3,0.242825,0.641281,0.778175,0.726872,00:00
4,0.188765,0.643456,0.790698,0.741722,00:00


In [91]:

# Score the test file with the trained learner and write a submission CSV.
test = pd.read_csv(os.path.join('..', 'input', 'test.csv'))
# Same column drop as for the training table, plus the row identifier.
test_data = test.drop(columns=['ID', 'pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy'])
test_df = test_data.copy()
test_dl = learn.dls.test_dl(test_df)
preds = learn.get_preds(dl=test_dl)
# NOTE(review): argmax yields class *indices*; these equal the original labels
# only if the label categories are 0, 1, ... in order — confirm.
test_labels = torch.argmax(preds[0], dim=1)
# Assemble the submission frame with the ID column first.
df = pd.DataFrame(test_labels.numpy())
df.columns = ['label']
df['ID'] = test['ID']
df = df[['ID', 'label']]
# to_csv does not create missing directories, so make sure ../output exists.
output_dir = os.path.join('..', 'output')
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, 'fastai_preds_2.csv'), index=False)

# Using GPT Embeddings

In [104]:
# load torch tensors

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from fastai.tabular.all import *
import os 
import pandas as pd

# Load the saved training embeddings and append them to the train.csv columns.
train_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_transcripts_concatenated.pt'))
# Each stored element is converted with .tolist() (presumably one embedding
# vector per training row — confirm against the script that wrote the file).
embed_rows = [t.tolist() for t in train_embeds.to_list()]
df = pd.DataFrame(embed_rows)
# Name the columns from the actual embedding width instead of a hard-coded size.
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
train = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
# concat(axis=1) aligns on the row index and pads with NaN when the lengths
# differ, so fail loudly rather than train on silently misaligned rows.
assert len(df) == len(train), f"{len(df)} embedding rows vs {len(train)} rows in train.csv"
data = pd.concat([train, df], axis=1)
len(data.columns)

2522

In [105]:
# Prepare the combined table (train.csv features + embeddings) and train.
# Remove the four columns that are not fed to the model and make the target categorical.
data = data.drop(columns=['pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy'])
data['label'] = data['label'].astype('category')
# Seeded 80/20 split so the validation rows are the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(data))
# Every column except the target is passed as a continuous feature.
to = TabularPandas(data, procs=[Categorify, FillMissing, Normalize],
                   cont_names=[col for col in data.columns if col != 'label'],
                   y_names='label',
                   splits=splits)

dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, metrics=[accuracy, F1Score()])
learn.fit_one_cycle(1)


epoch,train_loss,valid_loss,accuracy,f1_score,time
0,0.540995,0.48603,0.779964,0.735484,00:00


In [106]:

# Load the saved test embeddings and append them to the test.csv columns.
test_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_transcripts_concatenated.pt'))
# Each stored element is converted with .tolist() (presumably one embedding
# vector per test row — confirm against the script that wrote the file).
embed_rows = [t.tolist() for t in test_embeds.to_list()]
df = pd.DataFrame(embed_rows)
# Name the columns from the actual embedding width instead of a hard-coded size.
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
test = pd.read_csv(os.path.join('..', 'input', 'test.csv'))
# concat(axis=1) aligns on the row index and pads with NaN when the lengths
# differ, so fail loudly rather than predict on silently misaligned rows.
assert len(df) == len(test), f"{len(df)} embedding rows vs {len(test)} rows in test.csv"
test_data = pd.concat([test, df], axis=1)
# Same column drop as for the training table, plus the row identifier.
test_data = test_data.drop(columns=['ID', 'pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy'])

In [107]:
# Hand a deep copy of the test features to the learner's DataLoaders so that
# test_data itself is left as it was.
test_df = test_data.copy(deep=True)
test_dl = learn.dls.test_dl(test_df)

In [108]:
# Run the trained learner over the test DataLoader. The result is indexed as
# preds[0] by the following cell to obtain the per-class scores.
preds = learn.get_preds(dl=test_dl)

In [109]:
# Pick the highest-scoring class index for each test row.
test_labels = preds[0].argmax(dim=1)

In [110]:
# Build the submission table (ID first, then the predicted label) and save it.
df = pd.DataFrame(test_labels.numpy(), columns=['label'])
df.insert(0, 'ID', test['ID'])
submission_path = os.path.join('..', 'output', 'fastai_preds.csv')
df.to_csv(submission_path, index=False)

## Sub exp 

Don't use the features provided in `train.csv` at all: train on the embedding columns only (plus the label).

In [20]:
# load torch tensors

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from fastai.tabular.all import *
import os 
import pandas as pd

# --- Training data: label + embedding columns only -------------------------
train_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_transcripts_concatenated.pt'))
# Each stored element is converted with .tolist() (presumably one embedding
# vector per training row — confirm against the script that wrote the file).
embed_rows = [t.tolist() for t in train_embeds.to_list()]
df = pd.DataFrame(embed_rows)
# Name the columns from the actual embedding width instead of a hard-coded size.
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
train = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
# concat(axis=1) aligns on the row index and pads with NaN when the lengths
# differ, so fail loudly rather than train on silently misaligned rows.
assert len(df) == len(train), f"{len(df)} embedding rows vs {len(train)} rows in train.csv"
data = pd.concat([train['label'].to_frame(), df], axis=1)
data['label'] = data['label'].astype('category')

# --- Model -------------------------------------------------------------------
# Seeded 80/20 split so the validation rows are the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(data))
# Every column except the target is passed as a continuous feature.
to = TabularPandas(data, procs=[Categorify, FillMissing, Normalize],
                   cont_names=[col for col in data.columns if col != 'label'],
                   y_names='label',
                   splits=splits)

dls = to.dataloaders(bs=128)
learn = tabular_learner(dls, metrics=[accuracy, F1Score()])
learn.fit_one_cycle(10)

# --- Test data: embedding columns only ---------------------------------------
test_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_transcripts_concatenated.pt'))
embed_rows = [t.tolist() for t in test_embeds.to_list()]
df = pd.DataFrame(embed_rows)
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
# test.csv is read only for its ID column, which goes into the submission file.
test = pd.read_csv(os.path.join('..', 'input', 'test.csv'))
assert len(df) == len(test), f"{len(df)} embedding rows vs {len(test)} rows in test.csv"
test_data = df

# --- Predict and save --------------------------------------------------------
test_df = test_data.copy()
test_dl = learn.dls.test_dl(test_df)
preds = learn.get_preds(dl=test_dl)
# NOTE(review): argmax yields class *indices*; these equal the original labels
# only if the label categories are 0, 1, ... in order — confirm.
test_labels = torch.argmax(preds[0], dim=1)
# Assemble the submission frame with the ID column first.
df = pd.DataFrame(test_labels.numpy())
df.columns = ['label']
df['ID'] = test['ID']
df = df[['ID', 'label']]
df.to_csv(os.path.join('..', 'output', 'fastai_preds-3.csv'), index=False)

epoch,train_loss,valid_loss,accuracy,f1_score,time
0,0.654,0.555776,0.742397,0.689655,00:00
1,0.515539,0.46972,0.763864,0.714286,00:00
2,0.42694,0.448063,0.797853,0.75803,00:00
3,0.357721,0.458975,0.797853,0.754881,00:00
4,0.288818,0.424933,0.833631,0.792873,00:00
5,0.228815,0.44528,0.817531,0.77533,00:00
6,0.180121,0.440286,0.833631,0.796499,00:00
7,0.139765,0.445322,0.838998,0.804348,00:00
8,0.108029,0.45179,0.840787,0.803532,00:00
9,0.085489,0.456635,0.831843,0.795652,00:00


## Exp 3
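Not using the BOW features: every column whose name contains `BOW` is dropped; the remaining `train.csv` features and the embeddings are kept.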

In [16]:
# load torch tensors

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from fastai.tabular.all import *
import os 
import pandas as pd

# --- Training data: train.csv features (minus BOW) + embeddings --------------
train_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'train_embeds_transcripts_concatenated.pt'))
# Each stored element is converted with .tolist() (presumably one embedding
# vector per training row — confirm against the script that wrote the file).
embed_rows = [t.tolist() for t in train_embeds.to_list()]
df = pd.DataFrame(embed_rows)
# Name the columns from the actual embedding width instead of a hard-coded size.
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
train = pd.read_csv(os.path.join('..', 'input', 'train.csv'))
# concat(axis=1) aligns on the row index and pads with NaN when the lengths
# differ, so fail loudly rather than train on silently misaligned rows.
assert len(df) == len(train), f"{len(df)} embedding rows vs {len(train)} rows in train.csv"
data = pd.concat([train, df], axis=1)
data = data.drop(columns=['pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy'])
# Drop every column whose name contains 'BOW'.
data = data.drop(columns=[col for col in data.columns if 'BOW' in col])
print(len(data.columns))
data['label'] = data['label'].astype('category')

# --- Model -------------------------------------------------------------------
# Seeded 80/20 split so the validation rows are the same on every run.
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(data))
# Every column except the target is passed as a continuous feature.
to = TabularPandas(data, procs=[Categorify, FillMissing, Normalize],
                   cont_names=[col for col in data.columns if col != 'label'],
                   y_names='label',
                   splits=splits)


dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, metrics=[accuracy, F1Score()])
learn.fit_one_cycle(50)

# --- Test data: same feature selection as training ---------------------------
test_embeds = torch.load(os.path.join('..', 'output', 'embeddings', 'test_embeds_transcripts_concatenated.pt'))
embed_rows = [t.tolist() for t in test_embeds.to_list()]
df = pd.DataFrame(embed_rows)
column_names = [f"embeds_{i}" for i in range(df.shape[1])]
df.columns = column_names
test = pd.read_csv(os.path.join('..', 'input', 'test.csv'))
assert len(df) == len(test), f"{len(df)} embedding rows vs {len(test)} rows in test.csv"
test_data = pd.concat([test, df], axis=1)
test_data = test_data.drop(columns=['ID', 'pre requisite', 'concept', 'pre requisite taxonomy', 'concept taxonomy'])
# The BOW columns must be looked up on test_data itself: `data` had them removed
# above, so scanning data.columns yields an empty list and drops nothing.
test_data = test_data.drop(columns=[col for col in test_data.columns if 'BOW' in col])

# --- Predict and save --------------------------------------------------------
test_df = test_data.copy()
test_dl = learn.dls.test_dl(test_df)
preds = learn.get_preds(dl=test_dl)
# NOTE(review): argmax yields class *indices*; these equal the original labels
# only if the label categories are 0, 1, ... in order — confirm.
test_labels = torch.argmax(preds[0], dim=1)
# Assemble the submission frame with the ID column first.
df = pd.DataFrame(test_labels.numpy())
df.columns = ['label']
df['ID'] = test['ID']
df = df[['ID', 'label']]
df.to_csv(os.path.join('..', 'output', 'fastai_preds-4.csv'), index=False)

1853


epoch,train_loss,valid_loss,accuracy,f1_score,time
0,0.61483,0.556171,0.713775,0.663866,00:01
1,0.507161,0.484939,0.772809,0.730361,00:01
2,0.424267,0.425161,0.80322,0.754464,00:01
3,0.347808,0.38367,0.81932,0.776053,00:01
4,0.284108,0.388661,0.828265,0.778802,00:01
5,0.232615,0.392163,0.826476,0.782022,00:01
6,0.214863,0.560391,0.785331,0.736842,00:01
7,0.20971,0.416251,0.840787,0.796339,00:01
8,0.191507,0.443458,0.83542,0.787037,00:01
9,0.168587,0.481501,0.846154,0.795238,00:01
