In [None]:
# Install the third-party packages this notebook imports. The explicit %pip magic
# installs into the environment of the running kernel and does not depend on
# IPython's automagic being enabled; -q keeps the installer log out of the notebook.
%pip install -q gradio transformers sentence-transformers torch PyMuPDF python-docx


Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, PyMuPDF
Successfully installed PyMuPDF-1.26.4 python-docx-1.2.0


In [None]:
# =========================
# AI Interview Chatbot (Improved)
# =========================

import gradio as gr
import json
import random
import fitz  # PyMuPDF for PDF reading
import docx
import torch
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import datetime


In [None]:
import json

# Question bank for the interview chatbot.
# Every entry is a dict with exactly four string keys:
#   "role"     - job role the question targets
#   "skill"    - skill area within that role
#   "question" - the interview question text
#   "answer"   - a reference answer for that question
# Each (role, question) pair appears only once so that no question is
# over-represented when entries are drawn from this list.
interview_data = [
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "Can you explain the difference between INNER JOIN and LEFT JOIN in SQL?",
        "answer": "An INNER JOIN returns only rows with matching values in both tables, while a LEFT JOIN returns all rows from the left table, and matching rows from the right table. If no match exists, NULL is returned."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "How would you handle missing values in a large dataset using Python?",
        "answer": "I would first inspect the missingness pattern, then depending on context, use pandas functions like fillna() for imputation, dropna() to remove rows/columns, or advanced methods like sklearn's SimpleImputer or KNNImputer."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is the difference between UNION and UNION ALL in SQL?",
        "answer": "UNION removes duplicate rows from the result set, while UNION ALL includes all rows, even duplicates, making UNION ALL faster."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What are some common ETL tools you’ve used?",
        "answer": "Some popular ETL tools are Apache Airflow, Talend, Informatica, AWS Glue, and custom Python-based ETL pipelines."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "Can you explain the difference between Hadoop and Spark?",
        "answer": "Hadoop uses MapReduce for batch processing, which can be slower, while Spark processes data in-memory, supporting both batch and real-time streaming workloads."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "How do you set up data pipelines in AWS?",
        "answer": "I use AWS services like S3 for storage, Glue or Lambda for ETL, Redshift for warehousing, and Step Functions or Airflow for orchestration."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the difference between Type I and Type II error?",
        "answer": "Type I error is rejecting a true null hypothesis (false positive), while Type II error is failing to reject a false null hypothesis (false negative)."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "How do you handle class imbalance in datasets?",
        "answer": "Techniques include resampling (oversampling minority, undersampling majority), using SMOTE, adjusting class weights, or using anomaly detection models."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "What metrics would you use for a regression model?",
        "answer": "Common metrics include Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R² score."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "How do you handle categorical variables in machine learning?",
        "answer": "Options include one-hot encoding, label encoding, frequency encoding, and using embeddings for high-cardinality features."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What is the difference between VLOOKUP and INDEX-MATCH?",
        "answer": "VLOOKUP searches only left to right and can be slower, while INDEX-MATCH is more flexible, allowing lookups in any direction."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What are some best practices for designing dashboards?",
        "answer": "Keep it simple, use consistent color schemes, highlight KPIs, avoid clutter, and tailor the dashboard to the audience’s needs."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is a window function in SQL?",
        "answer": "Window functions perform calculations across a set of rows related to the current row, such as ranking, running totals, or moving averages."
    },
    # NOTE(review): the next two entries use the role label "Machine Learning Engineer",
    # while every later entry for this role uses "ML Engineer". The labels are left
    # untouched because code outside this cell may match on either string; confirm
    # which label the role filter expects and unify them.
    {
        "role": "Machine Learning Engineer",
        "skill": "Deployment",
        "question": "How would you deploy a machine learning model into production?",
        "answer": "I would containerize the model using Docker, expose it via an API (Flask/FastAPI), and orchestrate it using Kubernetes or cloud services like AWS SageMaker."
    },
    {
        "role": "Machine Learning Engineer",
        "skill": "MLOps",
        "question": "What is CI/CD in the context of ML?",
        "answer": "CI/CD in ML involves automating model training, testing, and deployment pipelines so that models can be updated reliably and continuously."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "How do you optimize Pandas operations for large datasets?",
        "answer": "I use vectorized operations, chunking, Dask for parallel computing, and push heavy operations to databases instead of doing them in Pandas."
    },
    {
        "role": "Data Scientist",
        "skill": "Deep Learning",
        "question": "What is the difference between CNN and RNN?",
        "answer": "CNNs are designed for spatial data like images, while RNNs are designed for sequential data like time series or text."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is a primary key and foreign key in SQL?",
        "answer": "A primary key uniquely identifies a row in a table, while a foreign key is a reference to a primary key in another table."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "Explain the difference between clustered and non-clustered indexes.",
        "answer": "A clustered index defines the physical order of rows in a table, while a non-clustered index creates a separate structure that points to the data rows."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is normalization in databases?",
        "answer": "Normalization is the process of organizing data to reduce redundancy and improve data integrity through normal forms."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is the purpose of an ETL pipeline?",
        "answer": "ETL (Extract, Transform, Load) pipelines move data from source systems, transform it into usable formats, and load it into a target database or warehouse."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is the difference between batch and streaming ETL?",
        "answer": "Batch ETL processes large volumes of data at intervals, while streaming ETL processes data in real-time as it arrives."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What are the main components of Hadoop?",
        "answer": "Hadoop consists of HDFS (storage), YARN (resource management), and MapReduce (processing)."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is the difference between Spark and MapReduce?",
        "answer": "MapReduce writes intermediate results to disk, while Spark keeps data in memory, making it faster and suitable for iterative algorithms."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "What is the difference between AWS Redshift and Snowflake?",
        "answer": "Redshift is AWS’s managed data warehouse tightly integrated with AWS ecosystem, while Snowflake is a cloud-agnostic data warehouse with automatic scaling."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "How do you read a large CSV file efficiently in Python?",
        "answer": "Use chunking with pandas read_csv(chunksize), or tools like Dask or PySpark for distributed reading."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the difference between correlation and causation?",
        "answer": "Correlation indicates a statistical relationship between variables, but causation implies one variable directly affects another."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is p-value in hypothesis testing?",
        "answer": "The p-value is the probability of observing results at least as extreme as the current ones, assuming the null hypothesis is true."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is overfitting in machine learning?",
        "answer": "Overfitting happens when a model learns noise in the training data, leading to poor generalization on unseen data."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is the bias-variance tradeoff?",
        "answer": "Bias measures error from overly simplistic models, while variance measures error from overly complex models. The tradeoff balances both."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "When would you use precision over recall?",
        "answer": "When false positives are more costly than false negatives, precision is prioritized. Example: detecting spam emails."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is feature scaling and why is it important?",
        "answer": "Feature scaling (normalization or standardization) ensures all features contribute equally, improving convergence for models like gradient descent."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is dimensionality reduction?",
        "answer": "Dimensionality reduction techniques like PCA reduce the number of features while retaining most of the variance in the data."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What is the difference between absolute and relative references in Excel?",
        "answer": "Relative references change when copied, while absolute references (with $) stay fixed."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What is the difference between bar chart and histogram?",
        "answer": "A bar chart compares categories, while a histogram shows the frequency distribution of continuous data."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is a subquery in SQL?",
        "answer": "A subquery is a query nested inside another query, used to filter or compute intermediate results."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is a stored procedure in SQL?",
        "answer": "A stored procedure is a precompiled set of SQL statements stored in the database, which can be executed with parameters to improve reusability and performance."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is the difference between DELETE and TRUNCATE in SQL?",
        "answer": "DELETE removes rows one at a time and can have conditions, while TRUNCATE quickly removes all rows without conditions and resets table identity."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is data lineage?",
        "answer": "Data lineage tracks the flow of data from source to destination, showing how it was transformed along the way for transparency and debugging."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "How do you handle schema changes in ETL pipelines?",
        "answer": "Use schema evolution features in tools like Spark or Glue, version schemas, and implement error handling for backward compatibility."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is the CAP theorem?",
        "answer": "The CAP theorem states that a distributed system can provide only two of the three: Consistency, Availability, and Partition Tolerance."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "What is a data lake?",
        "answer": "A data lake is a centralized storage repository that holds structured, semi-structured, and unstructured data in raw format, often in cloud storage like S3."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "What is the difference between list, tuple, and set in Python?",
        "answer": "Lists are ordered and mutable, tuples are ordered and immutable, and sets are unordered collections of unique elements."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the central limit theorem?",
        "answer": "It states that the distribution of sample means approaches a normal distribution as the sample size grows, regardless of population distribution."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is multicollinearity?",
        "answer": "Multicollinearity occurs when independent variables in a regression model are highly correlated, making coefficient estimates unstable."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is the difference between supervised and unsupervised learning?",
        "answer": "Supervised learning uses labeled data to train models, while unsupervised learning finds patterns or clusters in unlabeled data."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is cross-validation?",
        "answer": "Cross-validation splits the dataset into multiple folds to train and test the model on different subsets, improving reliability of evaluation."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "What is ROC-AUC?",
        "answer": "ROC-AUC measures the area under the ROC curve, representing the trade-off between true positive rate and false positive rate across thresholds."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is one-hot encoding?",
        "answer": "One-hot encoding represents categorical variables as binary vectors, with 1 indicating the presence of a category."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is feature selection and why is it important?",
        "answer": "Feature selection chooses the most relevant features, reducing overfitting, improving model performance, and lowering computational cost."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What are pivot tables used for in Excel?",
        "answer": "Pivot tables are used to summarize, analyze, and present data dynamically by grouping, filtering, and aggregating."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What are KPIs and how do you visualize them?",
        "answer": "KPIs are key performance indicators. They can be visualized with gauges, scorecards, or simple trend charts on dashboards."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is an index in SQL and why is it used?",
        "answer": "An index speeds up query performance by allowing the database to find rows faster, but it adds overhead for inserts and updates."
    },
    {
        "role": "ML Engineer",
        "skill": "Deployment",
        "question": "What is model drift?",
        "answer": "Model drift occurs when the statistical properties of input data change over time, reducing the accuracy of the model."
    },
    {
        "role": "ML Engineer",
        "skill": "MLOps",
        "question": "What tools are commonly used in MLOps?",
        "answer": "Popular MLOps tools include MLflow, Kubeflow, Airflow, TensorFlow Extended (TFX), and cloud-native ML services."
    },
    {
        "role": "ML Engineer",
        "skill": "Optimization",
        "question": "What is gradient descent?",
        "answer": "Gradient descent is an optimization algorithm that updates model parameters by moving in the opposite direction of the gradient to minimize loss."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is the difference between HAVING and WHERE in SQL?",
        "answer": "WHERE filters rows before aggregation, while HAVING filters groups after aggregation."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is a CTE (Common Table Expression) in SQL?",
        "answer": "A CTE is a temporary result set defined using WITH that can be referenced within a query, making queries easier to read and maintain."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What are slowly changing dimensions (SCD) in data warehousing?",
        "answer": "SCD refers to how historical data is managed when dimension data changes, with common types being SCD Type 1 (overwrite), Type 2 (add new row), and Type 3 (add new column)."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "How would you design an ETL pipeline for high scalability?",
        "answer": "I would use distributed systems like Spark, design modular pipelines, use cloud storage, and implement orchestration with Airflow or Prefect."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is partitioning in Spark?",
        "answer": "Partitioning divides data into smaller chunks that can be processed in parallel across a cluster, improving efficiency."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is the difference between HDFS and S3?",
        "answer": "HDFS is an on-prem distributed file system, while S3 is a cloud-based object storage with unlimited scalability and pay-as-you-go pricing."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "What is serverless computing?",
        "answer": "Serverless computing allows you to run code without managing servers, e.g., AWS Lambda or Azure Functions, charging only for execution time."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "What is the difference between shallow copy and deep copy in Python?",
        "answer": "A shallow copy copies references to objects, while a deep copy creates entirely new independent objects."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is heteroscedasticity?",
        "answer": "Heteroscedasticity occurs when the variance of errors in a regression model is not constant, violating regression assumptions."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the difference between parametric and non-parametric tests?",
        "answer": "Parametric tests assume a specific distribution (e.g., t-test), while non-parametric tests do not (e.g., Mann-Whitney U test)."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is regularization in machine learning?",
        "answer": "Regularization adds a penalty to model complexity (L1 or L2) to prevent overfitting."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is the difference between bagging and boosting?",
        "answer": "Bagging trains models in parallel on different subsets of data and averages results, while boosting trains models sequentially, correcting errors from previous ones."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "What is confusion matrix in classification?",
        "answer": "A confusion matrix is a table showing true positives, true negatives, false positives, and false negatives, used to evaluate classification performance."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is feature hashing?",
        "answer": "Feature hashing converts categorical variables into numeric vectors using a hash function, reducing memory usage for high-cardinality data."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What is conditional formatting in Excel?",
        "answer": "Conditional formatting changes cell appearance based on rules, e.g., highlighting values above a threshold."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What is the difference between heatmap and scatter plot?",
        "answer": "A heatmap shows values with colors in a matrix format, while a scatter plot shows relationships between two variables with points."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is a self-join in SQL?",
        "answer": "A self-join is when a table is joined with itself, often to compare rows within the same table."
    },
    {
        "role": "ML Engineer",
        "skill": "Deployment",
        "question": "What is a REST API and how is it used in ML deployment?",
        "answer": "A REST API exposes model predictions as endpoints, allowing applications to send requests and get predictions back."
    },
    {
        "role": "ML Engineer",
        "skill": "MLOps",
        "question": "What is continuous training (CT) in MLOps?",
        "answer": "Continuous training retrains models automatically when new data arrives, ensuring models stay updated with data drift."
    },
    {
        "role": "ML Engineer",
        "skill": "Optimization",
        "question": "What is early stopping in training?",
        "answer": "Early stopping halts training when validation performance stops improving, preventing overfitting."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is the difference between OLTP and OLAP databases?",
        "answer": "OLTP databases are optimized for fast transaction processing, while OLAP databases are optimized for complex queries and analytics."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What are window functions used for in SQL?",
        "answer": "Window functions perform calculations across a set of rows related to the current row, such as ranking, running totals, or moving averages."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is the difference between data lake and data warehouse?",
        "answer": "A data lake stores raw data in any format, while a data warehouse stores structured, cleaned, and curated data for analysis."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "How do you handle duplicate data in ETL pipelines?",
        "answer": "I use deduplication strategies such as DISTINCT queries, primary key constraints, and data quality checks during transformation."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is the difference between batch processing and real-time streaming?",
        "answer": "Batch processing handles large datasets at scheduled intervals, while streaming processes data continuously as it arrives."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is the role of Apache Kafka in data engineering?",
        "answer": "Kafka is a distributed messaging system used for building real-time data pipelines and streaming applications."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "What is the difference between IaaS, PaaS, and SaaS?",
        "answer": "IaaS provides infrastructure, PaaS provides a platform for development, and SaaS delivers ready-to-use applications over the cloud."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "What are Python decorators used for?",
        "answer": "Decorators modify the behavior of functions or classes without changing their source code, often used for logging or authentication."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the difference between mean, median, and mode?",
        "answer": "Mean is the average, median is the middle value, and mode is the most frequent value in a dataset."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is hypothesis testing?",
        "answer": "Hypothesis testing evaluates assumptions about a population parameter using sample data, typically involving null and alternative hypotheses."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is transfer learning?",
        "answer": "Transfer learning uses a pre-trained model on one task as a starting point for training on a related task, saving time and data."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is ensemble learning?",
        "answer": "Ensemble learning combines multiple models (e.g., bagging, boosting, stacking) to achieve better performance than individual models."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "When would you use F1 score instead of accuracy?",
        "answer": "F1 score is used when dealing with imbalanced datasets, as it balances precision and recall better than accuracy."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is polynomial feature expansion?",
        "answer": "Polynomial expansion creates new features by raising existing features to higher powers, capturing non-linear relationships."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What are array formulas in Excel?",
        "answer": "Array formulas perform multiple calculations on a range of values, returning either a single or multiple results."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What is the difference between descriptive and diagnostic analytics?",
        "answer": "Descriptive analytics explains what happened, while diagnostic analytics explains why it happened."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is a materialized view?",
        "answer": "A materialized view stores the results of a query physically, unlike a regular view which is just a virtual query reference."
    },
    {
        "role": "ML Engineer",
        "skill": "Deployment",
        "question": "What is model versioning?",
        "answer": "Model versioning tracks changes to machine learning models over time, ensuring reproducibility and rollback if needed."
    },
    {
        "role": "ML Engineer",
        "skill": "MLOps",
        "question": "What is data drift and how do you detect it?",
        "answer": "Data drift occurs when the input data distribution changes over time. It can be detected using statistical tests or monitoring model performance."
    },
    {
        "role": "ML Engineer",
        "skill": "Optimization",
        "question": "What is hyperparameter tuning?",
        "answer": "Hyperparameter tuning is the process of finding the best model configuration using methods like grid search, random search, or Bayesian optimization."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is a surrogate key in databases?",
        "answer": "A surrogate key is an artificial key, usually an auto-incremented ID, used to uniquely identify a record when a natural key is not practical."
    },
    {
        "role": "Data Engineer",
        "skill": "SQL",
        "question": "What is the difference between INNER JOIN and OUTER JOIN?",
        "answer": "INNER JOIN returns rows with matches in both tables, while OUTER JOIN also includes unmatched rows with NULLs."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is idempotency in ETL pipelines?",
        "answer": "Idempotency means running the same ETL job multiple times produces the same result, preventing duplicate or inconsistent data."
    },
    {
        "role": "Data Engineer",
        "skill": "ETL",
        "question": "What is data orchestration?",
        "answer": "Data orchestration is the automation of data workflows, scheduling, and dependencies using tools like Airflow or Prefect."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is shuffling in Spark?",
        "answer": "Shuffling is the process of redistributing data across partitions, often triggered by operations like groupBy or join."
    },
    {
        "role": "Data Engineer",
        "skill": "Big Data",
        "question": "What is schema-on-read vs schema-on-write?",
        "answer": "Schema-on-write enforces schema before storing data (data warehouses), while schema-on-read applies schema at query time (data lakes)."
    },
    {
        "role": "Data Engineer",
        "skill": "Cloud",
        "question": "What is autoscaling in cloud services?",
        "answer": "Autoscaling automatically adjusts computing resources based on workload demand, optimizing cost and performance."
    },
    {
        "role": "Data Engineer",
        "skill": "Python",
        "question": "What is a generator in Python?",
        "answer": "A generator is a function that yields values one at a time using 'yield', useful for memory-efficient iteration over large datasets."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is the difference between population and sample?",
        "answer": "Population is the entire group under study, while a sample is a subset used to make inferences about the population."
    },
    {
        "role": "Data Scientist",
        "skill": "Statistics",
        "question": "What is Bayesian inference?",
        "answer": "Bayesian inference updates the probability of a hypothesis as more evidence becomes available, based on Bayes’ theorem."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is reinforcement learning?",
        "answer": "Reinforcement learning is a type of ML where an agent learns by interacting with an environment and receiving rewards or penalties."
    },
    {
        "role": "Data Scientist",
        "skill": "Machine Learning",
        "question": "What is the curse of dimensionality?",
        "answer": "It refers to problems that arise when data has too many features, making distance measures less meaningful and models prone to overfitting."
    },
    {
        "role": "Data Scientist",
        "skill": "Model Evaluation",
        "question": "What is stratified sampling in model evaluation?",
        "answer": "Stratified sampling ensures that training and test splits preserve the same class distribution as the original dataset."
    },
    {
        "role": "Data Scientist",
        "skill": "Feature Engineering",
        "question": "What is target encoding?",
        "answer": "Target encoding replaces categorical values with the mean of the target variable for that category, useful for high-cardinality features."
    },
    {
        "role": "Data Analyst",
        "skill": "Excel",
        "question": "What is Power Query in Excel?",
        "answer": "Power Query is a data transformation tool that allows importing, cleaning, and reshaping data before analysis."
    },
    {
        "role": "Data Analyst",
        "skill": "Visualization",
        "question": "What is the difference between correlation and causation in dashboards?",
        "answer": "Correlation shows a statistical relationship between variables, but causation means one variable directly influences the other. Dashboards should not imply causation without proof."
    },
    {
        "role": "Data Analyst",
        "skill": "SQL",
        "question": "What is a recursive query in SQL?",
        "answer": "A recursive query is one that refers to itself, often using a CTE, to handle hierarchical or tree-structured data."
    },
    {
        "role": "ML Engineer",
        "skill": "Deployment",
        "question": "What is containerization and why is it useful for ML models?",
        "answer": "Containerization packages an ML model and its dependencies into a portable unit (e.g., Docker), ensuring consistency across environments."
    },
    {
        "role": "ML Engineer",
        "skill": "MLOps",
        "question": "What is a feature store in MLOps?",
        "answer": "A feature store is a centralized repository for storing, sharing, and serving features consistently for training and inference."
    },
    {
        "role": "ML Engineer",
        "skill": "Optimization",
        "question": "What is gradient clipping?",
        "answer": "Gradient clipping limits the size of gradients during training to prevent exploding gradients, stabilizing deep learning optimization."
    }
]

# Serialise the question bank as 4-space-indented JSON and write it to
# "interview_data.json" in the current working directory.
with open("interview_data.json", "w") as json_out:
    json_out.write(json.dumps(interview_data, indent=4))

# Confirmation message shown in the cell output.
print("✅ JSON file created and saved in Colab!")


✅ JSON file created and saved in Colab!


In [None]:
# Sanity check: display the container type of `interview_data`
# (the recorded output for this cell is `list`).
type(interview_data)


list

In [None]:
# =========================
# Load Question Dataset
# =========================
import json

# Reload the question bank from the JSON file written by the dataset cell above.
# The earlier version passed the in-memory list to json.loads(), which raises
# TypeError because json.loads() only accepts str, bytes or bytearray. Reading
# the file with json.load() yields the same list of dicts and keeps the cell
# safe to re-run.
with open("interview_data.json", "r", encoding="utf-8") as f:
    interview_data = json.load(f)


TypeError: the JSON object must be str, bytes or bytearray, not list

In [None]:
# =========================
# Load NLP Models
# =========================

# Text-to-text model that writes the answer feedback
# (chosen over distilgpt2 for better feedback quality)
chatbot_model = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
)

# Sentence-embedding model used for semantic similarity in question retrieval
embedder = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# =========================
# File Text Extraction
# =========================
def extract_text_from_pdf(pdf_path, max_chars=5000):
    """Return at most ``max_chars`` characters of text from a PDF.

    Pages are read in order and their text is concatenated. Reading stops as
    soon as enough characters have been collected, so long documents are not
    extracted in full just to be truncated afterwards.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        max_chars: Upper bound applied as a slice (``text[:max_chars]``).
            ``None`` keeps the whole text; a negative value keeps the usual
            Python slice meaning and therefore needs every page.

    Returns:
        The extracted text, truncated to ``max_chars``.
    """
    # Early exit is only valid for a non-negative integer limit; any other
    # slice bound (None, negative) depends on the full text.
    can_stop_early = isinstance(max_chars, int) and max_chars >= 0

    page_texts = []
    collected = 0
    with fitz.open(pdf_path) as doc:
        for page in doc:
            if can_stop_early and collected >= max_chars:
                break
            page_text = page.get_text()
            page_texts.append(page_text)
            collected += len(page_text)
    # Join once instead of growing a string page by page.
    return "".join(page_texts)[:max_chars]

def extract_text_from_docx(docx_path, max_chars=5000):
    """Return the paragraph text of a .docx file, truncated to ``max_chars``.

    The text of each paragraph in ``doc.paragraphs`` is joined with newlines
    before the result is sliced to ``max_chars`` characters.
    """
    document = docx.Document(docx_path)
    paragraph_lines = (paragraph.text for paragraph in document.paragraphs)
    joined_text = "\n".join(paragraph_lines)
    return joined_text[:max_chars]

def extract_text(file_path):
    """Extract text from a CV / job-description file based on its extension.

    The extension check is case-insensitive, so ``CV.PDF`` and ``Resume.DOCX``
    are handled the same way as their lower-case spellings.

    Args:
        file_path: Path of the uploaded file. An empty or missing path is
            treated as "nothing to read".

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` files, or an empty
        string for a missing path or any other file type.
    """
    if not file_path:
        return ""

    # Compare on a lower-cased copy; the original path is what gets opened.
    normalized = file_path.lower()
    if normalized.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    if normalized.endswith(".docx"):
        return extract_text_from_docx(file_path)
    return ""

In [None]:
# =========================
# Semantic Question Retrieval
# =========================
def retrieve_questions(cv_text, jd_text, top_k=5):
    """Return the ``top_k`` dataset entries most similar to the CV + JD text.

    The CV and job description are joined into one query, every entry of
    ``interview_data`` is rendered as "role skill question", both sides are
    embedded with ``embedder``, and the entries are ranked with
    ``util.semantic_search``. The returned items are the original dataset
    entries, in ranking order.
    """
    combined_profile = " ".join((cv_text, jd_text))
    profile_vector = embedder.encode(combined_profile, convert_to_tensor=True)

    corpus = []
    for entry in interview_data:
        corpus.append(
            "{} {} {}".format(entry["role"], entry["skill"], entry["question"])
        )
    corpus_vectors = embedder.encode(corpus, convert_to_tensor=True)

    # semantic_search returns one hit list per query; there is a single query here.
    ranked_hits = util.semantic_search(profile_vector, corpus_vectors, top_k=top_k)[0]

    matches = []
    for hit in ranked_hits:
        matches.append(interview_data[hit["corpus_id"]])
    return matches

In [None]:
# =========================
# Feedback Generation
# =========================
def give_feedback(answer, question):
    """Ask ``chatbot_model`` for constructive feedback on a candidate answer.

    Builds a three-line prompt (question, candidate answer, instruction),
    runs the model with ``max_length=100`` and input truncation enabled, and
    returns the first generation with surrounding whitespace removed.
    """
    prompt_lines = (
        f"Question: {question}",
        f"Candidate Answer: {answer}",
        "Provide constructive feedback on the answer:",
    )
    generations = chatbot_model("\n".join(prompt_lines), max_length=100, truncation=True)
    first_generation = generations[0]
    return first_generation["generated_text"].strip()

In [None]:
# =========================
# Session Management
# =========================
# Module-level store for the running interview; the "active" key holds the
# current session and is absent when no interview is in progress.
interview_sessions = {}

def start_interview(questions):
    """Start a new interview session and return the opening message.

    Args:
        questions: Either a list of questions, or the raw text of the
            "paste your questions" box with one question per line. Text input
            is split into lines and blank lines are dropped; without this a
            string would be indexed character by character, so "Question 1"
            would be a single letter.

    Returns:
        A message containing the first question, or a warning when no
        question was supplied (in which case no session is created).
    """
    if isinstance(questions, str):
        questions = [line.strip() for line in questions.splitlines() if line.strip()]
    else:
        # Copy so later changes to the caller's list do not alter the session.
        questions = list(questions or [])

    if not questions:
        return "⚠️ No relevant questions found. Please upload a CV and JD first."
    interview_sessions["active"] = {"questions": questions, "current": 0, "answers": []}
    first_question = questions[0]
    return f"🎤 Starting your personalized mock interview.\n\n**Question 1:** {first_question}"

def chat_with_bot(message, history):
    """Record the user's answer, then reply with feedback and the next step.

    ``message`` is stored as the answer to the current question of the active
    session and feedback is generated for it. If more questions remain, the
    reply also contains the next one; otherwise the session is written to a
    log file, removed from ``interview_sessions``, and a closing message is
    returned. ``history`` is accepted for the chat callback signature but is
    not used.
    """
    try:
        session = interview_sessions["active"]
    except KeyError:
        return "⚠️ Please start the mock interview first."

    asked = session["questions"]
    current_question = asked[session["current"]]

    # Store the answer before generating feedback for it.
    session["answers"].append({"question": current_question, "answer": message})
    feedback = give_feedback(message, current_question)

    # Advance the cursor; it now points at the upcoming question (if any).
    session["current"] += 1
    upcoming_index = session["current"]

    if upcoming_index >= len(asked):
        # Last question answered: log the transcript and close the session.
        save_session_log(session["questions"], session["answers"])
        del interview_sessions["active"]
        return f"💡 Final Feedback: {feedback}\n\n✅ Interview completed. Great job!"

    upcoming_question = asked[upcoming_index]
    return f"💡 Feedback: {feedback}\n\n**Next Question {upcoming_index + 1}:** {upcoming_question}"

In [None]:
# =========================
# Save Session Logs
# =========================
def save_session_log(questions, answers):
    """Write the interview transcript to a timestamped JSON file.

    The file is created in the current working directory and named
    ``interview_log_<YYYY-MM-DD_HH-MM-SS>.json``; it holds the questions and
    answers under the keys "questions" and "answers".
    """
    stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_path = "interview_log_" + stamp + ".json"
    payload = dict(questions=questions, answers=answers)
    with open(log_path, "w") as log_file:
        json.dump(payload, log_file, indent=4)


In [None]:
# =========================
# File Upload + Question Retrieval
# =========================
def upload_and_generate(cv_file, jd_file):
    """Extract text from the uploaded CV and JD and list matching questions.

    Args:
        cv_file: Uploaded CV, either a path string or an object exposing the
            path as ``.name``.
        jd_file: Uploaded job description, in the same form as ``cv_file``.

    Returns:
        One line per retrieved question in the form
        "question (Role: ..., Skill: ...)", or a warning message when an
        upload is missing or no text could be read from either file.
    """
    if not cv_file or not jd_file:
        return "⚠️ Please upload both a CV and a Job Description."

    # Accept both plain path strings and upload objects carrying `.name`.
    cv_path = cv_file if isinstance(cv_file, str) else cv_file.name
    jd_path = jd_file if isinstance(jd_file, str) else jd_file.name

    cv_text = extract_text(cv_path)
    jd_text = extract_text(jd_path)

    # With no text at all the similarity query would be blank and the
    # "relevant" questions meaningless, so report the problem instead.
    if not (cv_text.strip() or jd_text.strip()):
        return "⚠️ Could not read any text from the uploaded files. Please upload .pdf or .docx documents."

    relevant_questions = retrieve_questions(cv_text, jd_text)
    question_texts = [
        f"{q['question']} (Role: {q['role']}, Skill: {q['skill']})"
        for q in relevant_questions
    ]

    return "\n".join(question_texts)

In [None]:
# =========================
# Gradio UI
# =========================
# Two-tab app: tab 1 turns an uploaded CV + JD into a list of questions,
# tab 2 runs the mock interview on questions pasted by the user.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🧠 AI Interview Chatbot\nUpload your CV & JD → Generate Smart Questions → Practice Live")

    with gr.Tab("1️⃣ Upload & Generate Questions"):
        cv_input = gr.File(label="Upload CV (.pdf / .docx)")
        jd_input = gr.File(label="Upload Job Description (.pdf / .docx)")
        output_box = gr.Textbox(label="Relevant Interview Questions", lines=10)
        generate_btn = gr.Button("Generate Interview Questions")
        generate_btn.click(upload_and_generate, inputs=[cv_input, jd_input], outputs=output_box)

    with gr.Tab("2️⃣ Mock Interview"):
        question_list = gr.Textbox(label="Paste the questions you want to practice", lines=5)
        start_btn = gr.Button("Start Interview")
        # start_interview returns a plain Markdown string. gr.ChatInterface is
        # a Blocks layout rather than a single output component (and a chat
        # display would expect message history, not a string), so the opening
        # message / first question is rendered in its own Markdown component.
        start_status = gr.Markdown()
        chatbot = gr.ChatInterface(fn=chat_with_bot, title="🗣️ Mock Interview Assistant")
        start_btn.click(start_interview, inputs=question_list, outputs=start_status)


  self.chatbot = Chatbot(


In [None]:
# =========================
# Run App
# =========================
if __name__ == "__main__":
    # Start the Gradio server for the `app` Blocks defined above. The captured
    # output below shows that on Colab Gradio switches to a public share link.
    app.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://38523c2debde4c7e71.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
