In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## **IMPORTING THE DATASET**

In [2]:
# Location of the cleaned dataset. Set the STUDY_BUDDY_CSV environment variable
# to load a copy stored elsewhere; the path used when the notebook was written
# remains the fallback so existing setups keep working.
DATA_PATH = os.environ.get(
    "STUDY_BUDDY_CSV",
    r"C:\Users\sudhanshu\Downloads\Prajwal Sonekar\study_buddy_clean.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
df.shape

(87596, 7)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87596 entries, 0 to 87595
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   topic       87596 non-null  object
 1   context     87596 non-null  object
 2   question    87596 non-null  object
 3   answer      87596 non-null  object
 4   difficulty  87596 non-null  object
 5   attempted   87596 non-null  bool  
 6   correct     87596 non-null  int64 
dtypes: bool(1), int64(1), object(5)
memory usage: 4.1+ MB


In [5]:
df.isnull().sum()

topic         0
context       0
question      0
answer        0
difficulty    0
attempted     0
correct       0
dtype: int64

In [6]:
# Drop rows that have no answer, then renumber the index from 0. Later cells
# look rows up by label (df.loc[index, ...]); without the renumbering, any
# dropped row would leave a gap and turn that label into a KeyError.
df = df.dropna(subset=['answer']).reset_index(drop=True)

## **STUDY NOTES SUMMARY**

In [7]:
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

def simple_summary(text, max_sentences=3):
    """Return the leading sentences of ``text`` joined into one string.

    Parameters
    ----------
    text : str
        Passage to summarise. Anything that is not a non-blank string (for
        example ``None`` or a ``NaN`` float) yields an empty summary.
    max_sentences : int or None, default 3
        Maximum number of leading sentences to keep. ``None`` keeps every
        sentence; zero or a negative number yields an empty summary.

    Returns
    -------
    str
        The kept sentences separated by single spaces, or ``""``.
    """
    # Guard before tokenizing: non-string input would fail inside the
    # tokenizer, and a negative bound would slice from the end
    # (sentences[:-1] keeps everything except the last sentence).
    if not isinstance(text, str) or not text.strip():
        return ""
    if max_sentences is not None and max_sentences <= 0:
        return ""

    sentences = sent_tokenize(text)
    return " ".join(sentences[:max_sentences])

In [8]:
sample_context = df['context'].iloc[0]
simple_summary(sample_context)

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".'

## **QUICK QUESTION ANSWERING**

In [9]:
df[['question', 'answer']].iloc[0]

question    To whom did the Virgin Mary allegedly appear i...
answer                             Saint Bernadette Soubirous
Name: 0, dtype: object

## **GENERATE FLASHCARDS**

In [10]:
def generate_flashcard(index, data=None):
    """Build a question/answer flashcard from a single row.

    Parameters
    ----------
    index : hashable
        Index *label* of the row (looked up with ``.loc``, not by position).
    data : pandas.DataFrame, optional
        Frame with ``'question'`` and ``'answer'`` columns. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    dict
        ``{"Question": ..., "Answer": ...}`` for the requested row.

    Raises
    ------
    KeyError
        If ``index`` is not a label of the frame.
    """
    frame = df if data is None else data
    return {
        "Question": frame.loc[index, 'question'],
        "Answer": frame.loc[index, 'answer'],
    }

In [11]:
generate_flashcard(10)

{'Question': 'Where is the headquarters of the Congregation of the Holy Cross?',
 'Answer': 'Rome'}

## **QUIZ GENERATOR**

In [12]:
# Draw five random question/answer pairs to use as a quick quiz.
quiz = df[['question', 'answer']].sample(n=5)
quiz

Unnamed: 0,question,answer
53546,Who was part of Hollywood Ten?,Herbert Biberman
13596,What division of NBCUniversal revived Gramercy...,Focus Features
29659,Chinese troops attacked the UN forces when the...,the Yalu River
10416,"What Latin word does ""cardinal"" come from?",cardo
21519,Which train line used to have an 18th Street S...,IRT Lexington Avenue Line


## **SUMMARIES FOR EXAM USE**

In [13]:
from collections import Counter
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

stop_words = set(stopwords.words('english'))

def exam_focused_summary(text, max_sentences=3):
    """Extractive summary that keeps the sentences richest in frequent words.

    Every word of ``text`` that is not in the notebook-level ``stop_words``
    set is counted. Each sentence is scored as the sum of the counts of the
    words it contains, and the highest-scoring sentences are returned, best
    first. Sentences tied on score keep their order of appearance; a sentence
    containing no counted word is never selected; identical sentences share
    one entry whose scores add up.

    Parameters
    ----------
    text : str
        Passage to summarise. Anything that is not a non-blank string (for
        example ``None`` or a ``NaN`` float) yields an empty summary.
    max_sentences : int or None, default 3
        Maximum number of sentences to return. ``None`` returns every scored
        sentence; zero or a negative number yields an empty summary.

    Returns
    -------
    str
        The selected sentences separated by single spaces, or ``""``.
    """
    # Guard before tokenizing: non-string input would fail in ``.lower()``,
    # and a negative bound would slice the ranking from the end.
    if not isinstance(text, str) or not text.strip():
        return ""
    if max_sentences is not None and max_sentences <= 0:
        return ""

    freq = Counter(
        word for word in re.findall(r'\w+', text.lower()) if word not in stop_words
    )

    sentence_scores = {}
    for sent in sent_tokenize(text):
        # Counter returns 0 for words it never saw (the stop words), so they
        # contribute nothing to the sentence's score.
        score = sum(freq[word] for word in re.findall(r'\w+', sent.lower()))
        if score:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + score

    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    return " ".join(ranked[:max_sentences])

In [14]:
exam_focused_summary(df['context'].iloc[0])

'At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary.'

## **TURN Q&A INTO FLASHCARDS PROPERLY**

In [15]:
def flashcard(index, data=None, source="Study Buddy AI"):
    """Build a front/back flashcard, tagged with its source, from one row.

    Parameters
    ----------
    index : hashable
        Index *label* of the row (looked up with ``.loc``, not by position).
    data : pandas.DataFrame, optional
        Frame with ``'question'`` and ``'answer'`` columns. When omitted, the
        notebook-level ``df`` is used.
    source : str, default "Study Buddy AI"
        Text stored under the card's ``"Source"`` key.

    Returns
    -------
    dict
        Card with the keys ``"Front (Question)"``, ``"Back (Answer)"`` and
        ``"Source"``.

    Raises
    ------
    KeyError
        If ``index`` is not a label of the frame.
    """
    frame = df if data is None else data
    return {
        "Front (Question)": frame.loc[index, 'question'],
        "Back (Answer)": frame.loc[index, 'answer'],
        "Source": source,
    }

In [16]:
flashcard(3)

{'Front (Question)': 'What is the Grotto at Notre Dame?',
 'Back (Answer)': 'a Marian place of prayer and reflection',
 'Source': 'Study Buddy AI'}

## **QUIZ GENERATION (ADD DIFFICULTY TAGS)**

In [17]:
def difficulty_level(answer, easy_max=3, medium_max=8):
    """Tag an answer as Easy, Medium or Hard from its word count.

    Parameters
    ----------
    answer : str
        Answer text; words are the whitespace-separated pieces.
    easy_max : int, default 3
        Largest word count still tagged ``"Easy"``.
    medium_max : int, default 8
        Largest word count still tagged ``"Medium"``; anything longer is
        ``"Hard"``.

    Returns
    -------
    str
        ``"Easy"``, ``"Medium"`` or ``"Hard"``.
    """
    word_count = len(answer.split())
    if word_count <= easy_max:
        return "Easy"
    if word_count <= medium_max:
        return "Medium"
    return "Hard"

In [18]:
# Tag every row with a difficulty derived from the word count of its answer.
# This replaces the values the 'difficulty' column held when the CSV was loaded.
df['difficulty'] = df['answer'].apply(difficulty_level)

## **GENERATE AN EXAM QUIZ**

In [19]:
def generate_quiz(level="Medium", n=5, data=None, random_state=None):
    """Draw a random quiz of questions that share one difficulty tag.

    Parameters
    ----------
    level : str, default "Medium"
        Value of the ``'difficulty'`` column to select.
    n : int, default 5
        Number of questions wanted. When fewer rows match ``level``, all of
        them are returned instead of raising.
    data : pandas.DataFrame, optional
        Frame with ``'question'``, ``'answer'`` and ``'difficulty'`` columns.
        When omitted, the notebook-level ``df`` is used.
    random_state : int or numpy random generator, optional
        Passed to ``DataFrame.sample``; give a fixed value for a repeatable
        quiz.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, restricted to the question, answer and difficulty
        columns.
    """
    frame = df if data is None else data
    pool = frame[frame['difficulty'] == level]
    # sample() refuses to draw more rows than exist, so cap the request.
    picked = pool.sample(n=min(n, len(pool)), random_state=random_state)
    return picked[['question', 'answer', 'difficulty']]

In [20]:
generate_quiz("Easy")

Unnamed: 0,question,answer,difficulty
57394,Which disc could hold about three minutes of r...,10-inch,Easy
82046,What is the phenomenon where a P-N junction em...,electroluminescence,Easy
49779,What remained an important focus during the 19...,Communism,Easy
16077,What particle is associated with the yellowing...,lignin,Easy
12245,Which year did the USSR cancel the N1 rocket p...,1976,Easy


## **WEAK-TOPIC TRACKING (VERY IMPORTANT)**

In [21]:
import random

# Placeholder attempt data: every question is marked as attempted and given a
# simulated right/wrong outcome (1 = correct, 0 = wrong). A dedicated, seeded
# generator makes the simulated outcomes identical on every run without
# touching the global random state; a single choices() call replaces the
# per-row Python loop.
_outcome_rng = random.Random(42)

df['attempted'] = True
df['correct'] = _outcome_rng.choices([0, 1], k=len(df))

In [22]:
# Share of correct answers per difficulty tag: the mean of the 0/1 'correct'
# flag within each group.
# NOTE(review): rows are grouped by 'difficulty' although the result is named
# topic_performance; confirm that grouping by 'topic' was not intended.
topic_performance = (
    df.groupby('difficulty')['correct']
    .agg(accuracy='mean')
    .reset_index()
)

topic_performance

Unnamed: 0,difficulty,accuracy
0,Easy,0.497633
1,Hard,0.499037
2,Medium,0.488688


In [23]:
df.columns

Index(['topic', 'context', 'question', 'answer', 'difficulty', 'attempted',
       'correct'],
      dtype='object')

In [24]:
# Export is left disabled.
# NOTE(review): the target file name matches the CSV loaded at the top of the
# notebook; confirm that overwriting it with the current frame (including the
# randomly generated 'correct' values) is intended before re-enabling.
# df.to_csv("study_buddy_clean.csv", index=False)