In [5]:
#1.Text preprocessing:
# Setup cell: every import, the NLTK resource downloads, and the shared
# spaCy / transformers objects that the later cells reuse.
import re

import nltk
import pandas as pd
import spacy
import transformers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spacy.pipeline import EntityRuler
from transformers import pipeline

# Fetch the NLTK data packages (a no-op when they are already up to date).
# "stopwords" backs stopwords.words("english"); presumably "punkt_tab" backs
# word_tokenize and "wordnet" backs WordNetLemmatizer -- confirm before pruning.
nltk.download("stopwords")
nltk.download('punkt_tab')
nltk.download('wordnet')

# Small English spaCy pipeline used for lemmatization, POS tagging and NER.
nlp = spacy.load("en_core_web_sm")

# for sentiment analysis:
# Pin the checkpoint and revision explicitly. An unpinned
# pipeline("sentiment-analysis") call resolves to whatever default the
# installed transformers release ships, which makes results irreproducible and
# emits a "not recommended" warning. These values are the ones the unpinned
# call reported resolving to in this notebook's captured output.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
SENTIMENT_REVISION = "714eb0f"
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_REVISION,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rimsha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rimsha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rimsha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [2]:
# STEP NO 1: Define your input data
# Raw interview answer that the rest of the notebook cleans and scores.
# NOTE(review): "teamwo rk" is split by a stray space -- looks like either a
# typo or deliberate transcription noise; confirm which before "fixing" it.
user_response = (
    "Umm I think I am good at teamwo rk, "
    "because in my last job I worked with a team of 5 people "
    "to build a Python application at Google."
)

In [3]:
# STEP NO 2: Preprocessing / Cleaning:
# Each step writes a new variable, so the cell can be re-run safely and every
# intermediate stage stays inspectable.

# 1. LOWERCASE:
user_response_lower = user_response.lower()

# 2. Remove filler words (um, uh, you know, etc.):
# NOTE(review): several entries ("like", "so", "well", "right", "look", "see",
# "just", "really", "stuff", "things") are also ordinary content words and are
# removed unconditionally -- confirm that loss is acceptable for scoring.
filler_words = [
    "um", "umm", "uh", "erm", "hmm",
    "like", "you know", "i mean", "actually",
    "basically", "literally", "seriously",
    "okay", "ok", "so", "well",
    "right", "yeah", "yep", "y'know",
    "sort of", "kind of", "kinda",
    "just", "really", "anyway",
    "alright", "mm", "huh", "ah",
    "oh", "hmmm",
    "gotcha", "look", "see",
    "stuff", "things", "whatever"
]
# Whole-word / whole-phrase alternation; re.escape keeps entries such as
# "y'know" literal. Fillers are stripped BEFORE punctuation so that the
# apostrophe in "y'know" is still present when the pattern runs.
pattern = r"\b(" + "|".join(map(re.escape, filler_words)) + r")\b"
user_response_no_fillers = re.sub(pattern, "", user_response_lower)
# Collapse the gaps left behind by the removed fillers.
user_response_no_fillers = re.sub(r"\s+", " ", user_response_no_fillers).strip()

# 3. Remove punctuation / special characters:
# \w is Unicode-aware for str patterns, so accented letters ("résumé", "zoë")
# survive; an ASCII-only class would silently delete them. The explicit "|_"
# keeps underscores being stripped, since \w would otherwise retain them.
user_response_no_punc = re.sub(r"[^\w\s]|_", "", user_response_no_fillers)

# 4. Tokenize (split text into words):
user_response_tokenize = word_tokenize(user_response_no_punc)

# 5. Lemmatize Using pos (reduce words to their base form: “worked” → “work”):
# The tokens are re-joined so spaCy can tag the text as a whole before
# lemmatizing.
doc = nlp(" ".join(user_response_tokenize))
user_response_lemmatized = [token.lemma_ for token in doc]

# 6. Stopword removal (remove common useless words like “the”, “is”, “a”):
# Lemmas are lowercased first because spaCy can return capitalised lemmas
# (e.g. "I") while the NLTK stopword list is lowercase.
en_stopwords = set(stopwords.words("english"))
lowercased_lemmas = [lemma.lower() for lemma in user_response_lemmatized]
user_response_noStopwords = [lemma for lemma in lowercased_lemmas if lemma not in en_stopwords]

In [6]:
# STEP NO 3: Feature Extraction:
# 1. Extract Keywords (important terms in answer):
# Keep only nouns, proper nouns and verbs from the cleaned token stream.
# NOTE(review): the tagger sees lowercased, stopword-stripped text here, so POS
# tags may be less reliable than on the original sentence -- confirm.
doc_clean = nlp(" ".join(user_response_noStopwords))
keywords = [token.text for token in doc_clean if token.pos_ in ("NOUN", "PROPN", "VERB")]

# 2. Extract Named Entities (NER):
# Custom phrase patterns so programming languages are labelled as LANGUAGE
# before the statistical NER component runs.
patterns = [{"label": "LANGUAGE", "pattern": "Python"},
            {"label": "LANGUAGE", "pattern": "Java"},
            {"label": "LANGUAGE", "pattern": "C++"}]
# spaCy refuses to add a second component under an existing name, so a plain
# add_pipe call makes this cell fail when it is re-run on a live kernel.
# Dropping any previous ruler first keeps the cell idempotent and guarantees
# the ruler holds exactly the patterns listed above (no duplicates on re-run).
if nlp.has_pipe("entity_ruler"):
    nlp.remove_pipe("entity_ruler")
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)
# Entities are extracted from the ORIGINAL text: the patterns above are
# case-sensitive and the cleaned text has been lowercased.
doc_original = nlp(user_response)
named_entities = [(ent.text, ent.label_) for ent in doc_original.ents]

# 3. Detect Sentiment (tone of response):
# The pipeline returns a list with one result dict per input; keep its label.
sentiment_result = sentiment_pipeline(user_response)
sentiment_label = sentiment_result[0]['label']

'POSITIVE'

In [8]:
# STEP NO 4: Define Evaluation Rubric (early version):
# Three binary criteria (relevance, clarity, tone), each scored 0 or 1.

# 1. Relevance check:
# Scores 1 as soon as any extracted keyword matches a question keyword.
system_question = "Tell me about teamwork"
question_keywords = ["teamwork", "collaboration", "team"]
answer_keywords = keywords
relevance = 0
for candidate in answer_keywords:
    if candidate.lower() in question_keywords:
        relevance = 1
        break
print("Relevance:", relevance)

# 2. Clarity:
# Needs at least five content words AND at least one named entity.
word_count = len(user_response_noStopwords)
has_entities = bool(named_entities)
clarity = 1 if (word_count >= 5 and has_entities) else 0
print("Clarity:", clarity)

# 3. Tone / Sentiment
# Anything other than POSITIVE / NEUTRAL scores 0.
tone = int(sentiment_label in ("POSITIVE", "NEUTRAL"))
print("Tone:", tone)

# Combine into Rubric:
rubric = dict(relevance=relevance, clarity=clarity, tone=tone)

# Optional: overall score (0–3)
overall_score = relevance + clarity + tone
print("Rubric:", rubric)
print("Overall score:", overall_score)

Relevance: 1
Clarity: 1
Tone: 1
Rubric: {'relevance': 1, 'clarity': 1, 'tone': 1}
Overall score: 3


In [10]:
# STEP NO 5: Organize Outputs
# Gather every intermediate artefact of the pipeline into one record and
# pretty-print it.
from pprint import pprint

user_output = dict(
    original_response=user_response,
    cleaned_response=user_response_noStopwords,  # lemmatized + stopwords removed
    tokenized_words=user_response_tokenize,
    lemmatized_words=user_response_lemmatized,
    keywords=keywords,
    named_entities=named_entities,
    sentiment_label=sentiment_label,
)

# Display nicely
pprint(user_output)


{'cleaned_response': ['think',
                      'good',
                      'teamwo',
                      'rk',
                      'last',
                      'job',
                      'work',
                      'team',
                      '5',
                      'people',
                      'build',
                      'python',
                      'application',
                      'google'],
 'keywords': ['think',
              'teamwo',
              'rk',
              'job',
              'work',
              'team',
              'people',
              'build',
              'python',
              'application',
              'google'],
 'lemmatized_words': ['I',
                      'think',
                      'I',
                      'be',
                      'good',
                      'at',
                      'teamwo',
                      'rk',
                      'because',
                      'in',
                   