## *1) NUMERIC FEATURES PIPELINE*

In [None]:
# ========================================
# NUMERIC FEATURES PIPELINE
# ========================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.manifold import TSNE
import umap.umap_ as umap




In [None]:
# ========================================
# 1. Load Dataset & EDA
# ========================================
df = pd.read_csv("glassdoor_job_reviews.csv")   # path to Kaggle dataset

# Numeric features
numeric_cols = ["rating_overall","career_opportunity","compensation",
                "culture_values","senior_management","work_life_balance"]

# Encode the three sentiment classes; any other label value maps to NaN.
y = df["label"].map({"Negative":0,"No Opinion":1,"Positive":2})

# Drop rows whose label could not be encoded: a NaN target would turn the
# label column into floats, which CrossEntropyLoss cannot use as class ids.
labelled = y.notna()
y = y[labelled].astype("int64")
X_raw = df.loc[labelled, numeric_cols]

# Mean-imputed copy, used for the EDA plots only.
X = X_raw.fillna(X_raw.mean())

# EDA plots
plt.figure(figsize=(8,5))
sns.countplot(x=y)
plt.title("Label Distribution")
plt.show()

sns.heatmap(X.corr(),annot=True,cmap="Blues")
plt.title("Correlation Heatmap of Numeric Features")
plt.show()

# Split BEFORE imputing and scaling so that validation rows never contribute
# to the imputation means or to the scaler statistics (no data leakage).
X_train_raw,X_val_raw,y_train,y_val = train_test_split(X_raw,y,test_size=0.2,random_state=42)

# Standardize using training-set statistics only
train_means = X_train_raw.mean()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw.fillna(train_means))
X_val = scaler.transform(X_val_raw.fillna(train_means))

# Whole-dataset matrix expressed in the training-set scale.
X_scaled = scaler.transform(X_raw.fillna(train_means))


In [None]:
# ========================================
# 2. Numeric Baseline Models (LSTM/BiLSTM)
# ========================================
class LSTMModel(nn.Module):
    """Single-layer, unidirectional LSTM followed by a linear classifier.

    The input is a batch-first tensor (batch, seq_len, input_dim); the
    classifier reads the LSTM output at the last time step.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes)."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

class BiLSTMModel(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear classifier.

    The input is a batch-first tensor (batch, seq_len, input_dim). The
    classifier reads the concatenated final hidden states of both directions.
    """
    def __init__(self,input_dim,hidden_dim,num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_dim,hidden_dim,bidirectional=True,batch_first=True)
        # Both directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(hidden_dim*2,num_classes)
    def forward(self,x):
        """Return class logits of shape (batch, num_classes)."""
        _,(h,_) = self.lstm(x)
        # h[-2] is the forward direction after the last step and h[-1] the
        # backward direction after the first step, so each half has read the
        # whole sequence. Taking out[:, -1, :] instead would give the backward
        # half a state that has only seen the final step. For seq_len == 1 the
        # two formulations are identical.
        summary = torch.cat((h[-2],h[-1]),dim=1)
        return self.fc(summary)



In [None]:
# ========================================
# 3. Train & Eval Function
# ========================================
def train_model(model,train_loader,val_loader,epochs=10,lr=0.001):
    """Train `model` with Adam and cross-entropy, tracking mean losses.

    The model is moved to CUDA when available (otherwise CPU) and stays there.

    Args:
        model: network mapping a feature batch to class logits.
        train_loader: iterable of (features, targets) batches for training.
        val_loader: iterable of (features, targets) batches for validation.
        epochs: number of passes over `train_loader`.
        lr: Adam learning rate.

    Returns:
        (train_losses, val_losses): one mean per-batch loss per epoch each.
    """
    use_gpu = torch.cuda.is_available()
    device = torch.device("cuda" if use_gpu else "cpu")
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, val_losses = [], []

    for epoch_idx in range(epochs):
        # ---- optimisation pass ----
        model.train()
        running = 0
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(features), targets)
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()
        train_losses.append(running / len(train_loader))

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        running = 0
        with torch.no_grad():
            for features, targets in val_loader:
                features = features.to(device)
                targets = targets.to(device)
                running += criterion(model(features), targets).item()
        val_losses.append(running / len(val_loader))

        print(f"Epoch {epoch_idx+1}: Train {train_losses[-1]:.4f}, Val {val_losses[-1]:.4f}")
    return train_losses, val_losses

# Prepare DataLoaders
# unsqueeze(1) inserts a sequence axis of length 1 so that each row becomes a
# (seq_len=1, n_features) sequence for the batch-first LSTM.
train_loader = DataLoader(TensorDataset(torch.tensor(X_train).float().unsqueeze(1),
                                        torch.tensor(y_train.values)),batch_size=32,shuffle=True)
val_loader = DataLoader(TensorDataset(torch.tensor(X_val).float().unsqueeze(1),
                                      torch.tensor(y_val.values)),batch_size=32)

# Train BiLSTM
bilstm = BiLSTMModel(len(numeric_cols),64,3)
train_losses,val_losses = train_model(bilstm,train_loader,val_loader,epochs=5)


In [None]:

# ========================================
# 4. Evaluation & Visualization
# ========================================
# train_model() moves the model to the GPU when one is available, so inputs
# must be sent to the model's device and outputs brought back to the CPU.
device = next(bilstm.parameters()).device

bilstm.eval()
y_pred,y_true=[],[]
with torch.no_grad():
    for xb,yb in val_loader:
        out = bilstm(xb.to(device))
        preds = torch.argmax(out,dim=1)
        y_pred.extend(preds.cpu().numpy()); y_true.extend(yb.numpy())

print(classification_report(y_true,y_pred))
cm = confusion_matrix(y_true,y_pred)
sns.heatmap(cm,annot=True,fmt="d",cmap="Blues")
plt.title("Confusion Matrix - Numeric BiLSTM")
plt.show()

# Loss curve
plt.plot(train_losses,label="Train Loss"); plt.plot(val_losses,label="Val Loss")
plt.legend(); plt.title("Loss Curve - BiLSTM"); plt.show()

# Embedding visualization (TSNE, UMAP)
# The "embeddings" are the model's output logits for the validation rows.
with torch.no_grad():
    embeddings = bilstm(torch.tensor(X_val).float().unsqueeze(1).to(device)).cpu().numpy()
tsne = TSNE(n_components=2).fit_transform(embeddings)
umap_emb = umap.UMAP().fit_transform(embeddings)

plt.scatter(tsne[:,0],tsne[:,1],c=y_val,cmap="viridis"); plt.title("t-SNE (Numeric)"); plt.show()
plt.scatter(umap_emb[:,0],umap_emb[:,1],c=y_val,cmap="plasma"); plt.title("UMAP (Numeric)"); plt.show()

In [None]:
# ========================================
# 1. Load Libraries
# ========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.manifold import TSNE
import umap.umap_ as umap

from torch_geometric.nn import GCNConv, SAGEConv, HANConv
from torch_geometric.data import Data

# ========================================
# 2. Load Dataset
# ========================================
df = pd.read_csv("glassdoor_reviews.csv")

# Sub-ratings plus the overall rating that the labels are derived from.
numeric_cols = [
    "Career Opportunities",
    "Comp & Benefits",
    "Culture & Values",
    "Senior Management",
    "Work/Life Balance",
    "Rating",
]

# Keep only those columns, and only rows where all of them are present.
df = df[numeric_cols].dropna()

# Map labels from overall Rating
def map_sentiment(r):
    """Map an overall rating to a sentiment label.

    Ratings of 2 or less are "Negative" and ratings of 4 or more are
    "Positive". Everything else is "No Opinion": that covers 3, fractional
    values strictly between 2 and 4, and NaN. These previously fell through
    to "Positive" because only an exact 3 was treated as neutral.
    """
    if r <= 2:
        return "Negative"
    if r >= 4:
        return "Positive"
    # Every comparison with NaN is False, so NaN also ends up here.
    return "No Opinion"

df["Label"] = df["Rating"].apply(map_sentiment)

# ========================================
# 3. Preprocessing
# ========================================
# "Rating" is dropped from the features because the label is derived from it.
X = df.drop(columns=["Rating","Label"])
y = df["Label"]

le = LabelEncoder()
y_enc = le.fit_transform(y)

# Split on the raw features first so the scaler statistics come from the
# training rows only (fitting on all rows leaks test-set information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# All rows in the training-set scale, in original row order.
X_scaled = scaler.transform(X)

X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_test_t = torch.tensor(y_test, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), batch_size=32)

# ========================================
# 4. Baseline Models (LSTM, BiLSTM)
# ========================================
class LSTMModel(nn.Module):
    """LSTM classifier for flat feature vectors, optionally bidirectional.

    Each input row of shape (batch, input_dim) is treated as a sequence of
    length one; the classifier reads the final hidden state(s).
    """

    def __init__(self, input_dim, hidden_dim, output_dim, bi=False):
        super().__init__()
        num_directions = 2 if bi else 1
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=bi)
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim)."""
        sequence = x.unsqueeze(1)  # add a sequence axis of length 1
        _, (hidden, _) = self.lstm(sequence)
        if not self.lstm.bidirectional:
            return self.fc(hidden[-1])
        # Concatenate the two directions' final states.
        both_directions = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(both_directions)

def train_model(model, train_loader, test_loader, epochs=10):
    """Train `model` with Adam (lr=0.001) and cross-entropy loss.

    Args:
        model: network mapping a feature batch to class logits.
        train_loader: iterable of (features, targets) training batches.
        test_loader: iterable of (features, targets) batches used to compute
            the per-epoch validation loss.
        epochs: number of passes over `train_loader`.

    Returns:
        dict with keys "train_loss" and "val_loss", each a list holding one
        mean per-batch loss per epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    adam = optim.Adam(model.parameters(), lr=0.001)
    train_curve, val_curve = [], []
    history = {"train_loss": train_curve, "val_loss": val_curve}

    for epoch_no in range(1, epochs + 1):
        # ---- optimisation pass ----
        model.train()
        running = 0
        for features, targets in train_loader:
            adam.zero_grad()
            batch_loss = loss_fn(model(features), targets)
            batch_loss.backward()
            adam.step()
            running += batch_loss.item()
        train_curve.append(running / len(train_loader))

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        running = 0
        with torch.no_grad():
            for features, targets in test_loader:
                running += loss_fn(model(features), targets).item()
        val_curve.append(running / len(test_loader))

        print(f"Epoch {epoch_no}: Train Loss={train_curve[-1]:.4f} Val Loss={val_curve[-1]:.4f}")
    return history

# Train LSTM
lstm_model = LSTMModel(input_dim=X_train.shape[1], hidden_dim=32, output_dim=3)
lstm_history = train_model(lstm_model, train_loader, test_loader, epochs=5)

# Train BiLSTM
bilstm_model = LSTMModel(input_dim=X_train.shape[1], hidden_dim=32, output_dim=3, bi=True)
# Keep the returned loss curves: the "Training/Validation Curves" section
# plots `history`, which was otherwise never defined.
history = train_model(bilstm_model, train_loader, test_loader, epochs=5)

# ========================================
# 5. MLP Baseline (stand-in for BERT, RoBERTa, DistilBERT)
# ========================================
# Transformer text models cannot consume numeric features directly: the
# features would have to be tokenized into pseudo-text or fed through a
# TabTransformer. Neither is done here; a plain MLP is trained instead.
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear, returning logits."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim)."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

# One input unit per feature column, 64 hidden units, 3 sentiment classes.
mlp_model = MLP(input_dim=X_train.shape[1], hidden_dim=64, output_dim=3)
train_model(mlp_model, train_loader, test_loader, epochs=5)

# ========================================
# 6. Proposed GNN
# ========================================
# Build simple kNN graph from numeric features
from sklearn.neighbors import kneighbors_graph
# Each row is linked to its 5 nearest neighbours in feature space; COO
# format exposes the edge endpoints as the .row / .col arrays.
adj = kneighbors_graph(X_scaled, n_neighbors=5).tocoo()
# Stack the endpoints into one (2, num_edges) ndarray first: building a
# tensor from a Python list of ndarrays is very slow and emits a warning.
edge_index = torch.tensor(np.vstack((adj.row, adj.col)), dtype=torch.long)

# Node features and labels for every row, in original row order.
x = torch.tensor(X_scaled, dtype=torch.float32)
y_all = torch.tensor(y_enc, dtype=torch.long)

class GNN(nn.Module):
    """Two-layer graph convolutional network (GCNConv -> ReLU -> GCNConv)."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return one row of class logits per node."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

gnn_model = GNN(X_train.shape[1], 32, 3)
optimizer = optim.Adam(gnn_model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# Node i of the graph is row i of X_scaled, whereas X_train / y_train are a
# SHUFFLED sample of those rows. Slicing the output with [:len(X_train)]
# therefore pairs nodes with other rows' labels. Split the node indices
# instead (same test_size and random_state as the feature split) and look
# labels up by index so every node is compared with its own label.
train_idx_np, test_idx_np = train_test_split(
    np.arange(len(y_enc)), test_size=0.2, random_state=42)
train_idx = torch.as_tensor(train_idx_np, dtype=torch.long)
test_idx = torch.as_tensor(test_idx_np, dtype=torch.long)

# Training loop (full-batch: one forward pass over the whole graph per epoch)
for epoch in range(10):
    gnn_model.train()
    optimizer.zero_grad()
    out = gnn_model(x, edge_index)
    loss = criterion(out[train_idx], y_all[train_idx])
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss={loss.item():.4f}")

# ========================================
# 7. Evaluation & Confusion Matrix
# ========================================
# Re-run the model in eval mode: the `out` left over from the loop was
# computed before the final optimizer step and still carries gradients.
gnn_model.eval()
with torch.no_grad():
    out = gnn_model(x, edge_index)
pred = out[test_idx].argmax(dim=1).numpy()
y_test_nodes = y_enc[test_idx_np]
print(classification_report(y_test_nodes, pred, target_names=le.classes_))

cm = confusion_matrix(y_test_nodes, pred)
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
plt.show()

# ========================================
# 8. Training/Validation Curves
# (example from earlier history dictionary)
# ========================================
# Train and validation loss per epoch, on the same axes.
for curve_key, curve_label in (("train_loss", "Train Loss"), ("val_loss", "Val Loss")):
    plt.plot(history[curve_key], label=curve_label)
plt.legend()
plt.show()

# ========================================
# 9. Embedding Visualizations (t-SNE, UMAP)
# ========================================
# The GNN output for every node is used as its embedding.
embeddings = out.detach().numpy()

tsne = TSNE(n_components=2).fit_transform(embeddings)
plt.scatter(tsne[:,0], tsne[:,1], c=y_enc, cmap="coolwarm")
plt.title("t-SNE Embeddings")
plt.show()

umap_emb = umap.UMAP(n_components=2).fit_transform(embeddings)
plt.scatter(umap_emb[:,0], umap_emb[:,1], c=y_enc, cmap="viridis")
plt.title("UMAP Embeddings")
plt.show()

# ========================================
# 10. Graph Node Visualization
# ========================================
import networkx as nx
# from_scipy_sparse_matrix was removed in NetworkX 3.0; its replacement is
# from_scipy_sparse_array. Fall back to the old name on older installs.
if hasattr(nx, "from_scipy_sparse_array"):
    G = nx.from_scipy_sparse_array(adj)
else:
    G = nx.from_scipy_sparse_matrix(adj)
plt.figure(figsize=(6,6))
# Nodes are coloured by their encoded sentiment label.
nx.draw(G, node_color=y_enc, node_size=20, cmap=plt.cm.Set1)
plt.title("Graph Node Visualization")
plt.show()

# ========================================
# 11. Confidence Predictions + Job Recommendations
# ========================================
# Turn logits into class probabilities; the largest one is the confidence.
probs = out.softmax(dim=1)
confidence = probs.max(dim=1).values
pred_labels = probs.argmax(dim=1)

# Show the first five nodes: true label, predicted label, confidence.
for i in range(5):
    true_name = le.classes_[y_enc[i]]
    pred_name = le.classes_[pred_labels[i]]
    print(f"Sample {i}: True={true_name}, Pred={pred_name}, Conf={confidence[i].item():.2f}")

### ***2) TEXTUAL FEATURES PIPELINE***

In [None]:
# ========================================
# TEXTUAL FEATURES PIPELINE
# ========================================
import torch
from transformers import BertTokenizer,BertModel,RobertaTokenizer,RobertaModel,DistilBertTokenizer,DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns; import matplotlib.pyplot as plt

# Load reviews
texts = df["review_text"].fillna(" ").tolist()
labels = df["label"].map({"Negative":0,"No Opinion":1,"Positive":2}).tolist()

# Split
X_train,X_val,y_train,y_val = train_test_split(texts,labels,test_size=0.2,random_state=42)

# Example: BERT embeddings
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()  # inference only

# Encode in mini-batches. A single forward pass over the whole validation set
# keeps the activations of every review in memory at once, which can exhaust
# memory on realistically sized datasets.
embed_batch_size = 32
cls_batches = []
with torch.no_grad():
    for start in range(0,len(X_val),embed_batch_size):
        inputs = tokenizer(X_val[start:start+embed_batch_size],return_tensors="pt",
                           padding=True,truncation=True,max_length=128)
        # Position 0 of the last hidden state is the [CLS] token vector.
        cls_batches.append(bert_model(**inputs).last_hidden_state[:,0,:])
embeddings = torch.cat(cls_batches,dim=0)

# Classification layer
# NOTE(review): clf is randomly initialised and never trained in this cell,
# so the report below reflects an untrained classifier.
clf = nn.Linear(embeddings.shape[1],3)
outputs = clf(embeddings)

preds = torch.argmax(outputs,dim=1).numpy()
print(classification_report(y_val,preds))

cm = confusion_matrix(y_val,preds)
sns.heatmap(cm,annot=True,fmt="d",cmap="Blues"); plt.title("Confusion Matrix - BERT"); plt.show()



In [None]:
# ========================================
# Graph Construction (HAN, GraphSAGE, GNN)
# ========================================
import torch_geometric
from torch_geometric.data import Data
import networkx as nx

# Simple job-firm-review graph: four directed edges over three nodes.
source_nodes = [0, 1, 2, 2]
target_nodes = [1, 2, 0, 1]
edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.long)

# Dummy node features: the first three BERT embedding rows.
x = embeddings[0:3]
graph_data = Data(edge_index=edge_index, x=x)

# HAN Model
import torch_geometric.nn as pyg_nn
class HAN(torch.nn.Module):
    """Graph-attention classifier: one two-head GATConv layer plus a linear head.

    NOTE(review): despite the name, this uses GATConv rather than HANConv.
    """

    def __init__(self, in_dim, hid_dim, out_dim):
        super().__init__()
        num_heads = 2
        self.conv = pyg_nn.GATConv(in_dim, hid_dim, heads=num_heads)
        # The linear head takes hid_dim * num_heads inputs.
        self.fc = nn.Linear(hid_dim * num_heads, out_dim)

    def forward(self, x, edge_index):
        """Return one row of class logits per node."""
        attended = self.conv(x, edge_index)
        return self.fc(attended)

# Forward pass of the (untrained) model over the toy graph.
han = HAN(in_dim=embeddings.shape[1], hid_dim=64, out_dim=3)
out = han(graph_data.x, graph_data.edge_index)

# Graph Visualization
G = nx.Graph()
# edge_index is (2, num_edges); transposing yields one (src, dst) pair per row.
G.add_edges_from(edge_index.numpy().T)
plt.figure(figsize=(6,6))
nx.draw(G, with_labels=True, node_color="lightblue", node_size=500)
plt.title("Graph Visualization (Jobs-Firms-Reviews)")
plt.show()



In [None]:

# ========================================
# Confidence prediction
# ========================================
# Normalise the classifier logits across the class axis.
softmax = nn.Softmax(dim=1)
conf = softmax(outputs)
sample_conf = conf[:5]
print("Confidence Scores (sample):", sample_conf)

In [None]:
# ========================================
# 1. Load Libraries
# ========================================
import pandas as pd
import numpy as np
import re, string
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
import umap.umap_ as umap

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel, DistilBertTokenizer, DistilBertModel

import torch_geometric
from torch_geometric.nn import GCNConv, HANConv, SAGEConv
from torch_geometric.data import Data

# ========================================
# 2. Load Dataset
# ========================================
# Text fields plus the overall rating; rows missing any of them are dropped.
review_columns = ['Headline','Pros','Cons','Rating']
df = pd.read_csv("glassdoor_reviews.csv")   # <-- place dataset here
df = df[review_columns].dropna()

# Map labels: 1-2=Negative, 3=No Opinion, 4-5=Positive
def map_sentiment(r):
    """Map an overall rating to a sentiment label.

    Ratings of 2 or less are "Negative" and ratings of 4 or more are
    "Positive". Everything else is "No Opinion": that covers 3, fractional
    values strictly between 2 and 4, and NaN. These previously fell through
    to "Positive" because only an exact 3 was treated as neutral.
    """
    if r <= 2:
        return "Negative"
    if r >= 4:
        return "Positive"
    # Every comparison with NaN is False, so NaN also ends up here.
    return "No Opinion"

# Derive the sentiment label of each review from its overall rating.
df['Label'] = df['Rating'].map(map_sentiment)

# Preview the first rows.
print(df.head(5))

# ========================================
# 3. Text Preprocessing
# ========================================
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
# A set gives constant-time membership tests during cleaning.
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalise a review string for modelling.

    Lower-cases the text, removes digit runs and ASCII punctuation, then
    drops every whitespace-separated token found in the module-level
    `stop_words` set. The surviving tokens are joined with single spaces.
    """
    lowered = str(text).lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = without_digits.translate(punctuation_table)
    kept_words = (word for word in without_punct.split() if word not in stop_words)
    return " ".join(kept_words)

# Merge the three text fields into one document per review, then clean it.
combined_text = df['Headline'] + " " + df['Pros'] + " " + df['Cons']
df['text'] = combined_text.apply(clean_text)

# Train-Test Split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Label_enc'] = le.fit_transform(df['Label'])
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['Label_enc'],
    test_size=0.2,
    random_state=42,
)

# ========================================
# 4. Baseline Models: LSTM / BiLSTM
# ========================================
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

max_words = 10000   # vocabulary size kept by the tokenizer
max_len = 100       # every sequence is padded or truncated to this length

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_seq = pad_sequences(train_sequences, maxlen=max_len)
X_test_seq = pad_sequences(test_sequences, maxlen=max_len)

# ----- LSTM -----
model_lstm = Sequential()
model_lstm.add(Embedding(max_words, 128, input_length=max_len))
model_lstm.add(LSTM(128))
model_lstm.add(Dense(3, activation="softmax"))
model_lstm.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history_lstm = model_lstm.fit(X_train_seq, y_train, batch_size=64, epochs=3, validation_split=0.2)

# ----- BiLSTM -----
model_bilstm = Sequential()
model_bilstm.add(Embedding(max_words, 128, input_length=max_len))
model_bilstm.add(Bidirectional(LSTM(128)))
model_bilstm.add(Dense(3, activation="softmax"))
model_bilstm.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history_bilstm = model_bilstm.fit(X_train_seq, y_train, batch_size=64, epochs=3, validation_split=0.2)

# ========================================
# 5. Transformer Models: BERT, RoBERTa, DistilBERT
# ========================================
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

def train_transformer(model_name, train_texts, train_labels, test_texts, test_labels):
    """Fine-tune a Hugging Face sequence classifier and print its test report.

    Args:
        model_name: checkpoint name passed to the Auto* `from_pretrained` calls.
        train_texts: iterable of training strings.
        train_labels: iterable of integer class ids aligned with `train_texts`.
        test_texts: iterable of evaluation strings.
        test_labels: iterable of integer class ids aligned with `test_texts`.

    Returns:
        The fine-tuned model. Earlier versions returned nothing, so callers
        that ignore the result are unaffected.
    """
    import inspect  # local import: only needed for the version check below

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
    test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=128)

    class TextDataset(torch.utils.data.Dataset):
        """Pairs tokenizer encodings with labels in the format Trainer expects."""
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels
        def __len__(self): return len(self.labels)
        def __getitem__(self, idx):
            item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

    train_dataset = TextDataset(train_encodings, list(train_labels))
    test_dataset = TextDataset(test_encodings, list(test_labels))

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; use whichever name this install accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir="./logs",
        **{strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()
    preds = trainer.predict(test_dataset)
    y_pred = np.argmax(preds.predictions, axis=1)
    print(classification_report(test_labels, y_pred, target_names=le.classes_))
    return model

# Run BERT, RoBERTa, DistilBERT
# Keep the fine-tuned BERT model: the demonstration section at the end of the
# file calls `model(...)`, which was otherwise never defined.
model = train_transformer("bert-base-uncased", X_train, y_train, X_test, y_test)
train_transformer("roberta-base", X_train, y_train, X_test, y_test)
train_transformer("distilbert-base-uncased", X_train, y_train, X_test, y_test)

# ========================================
# 6. HAN, GraphSAGE, Proposed GNN
# ========================================

from torch_geometric.nn import GCNConv

class SimpleGNN(torch.nn.Module):
    """Two-layer graph convolutional network (GCNConv -> ReLU -> GCNConv)."""

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return one row of class logits per node."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# ========================================
# 7. Evaluation: Confusion Matrix + Reports
# ========================================
def plot_confusion(y_true, y_pred, title):
    """Draw an annotated confusion-matrix heatmap.

    Rows are true classes and columns predicted classes; tick labels are
    taken from the module-level label encoder `le`.
    """
    class_names = le.classes_
    matrix = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# ========================================
# 8. Visualizations: t-SNE, UMAP
# ========================================
def plot_embeddings(X, y, method="tsne"):
    """Project `X` to 2-D and scatter-plot it coloured by `y`.

    `method="tsne"` selects t-SNE; any other value selects UMAP.
    """
    reducer = TSNE(n_components=2) if method == "tsne" else umap.UMAP(n_components=2)
    points = reducer.fit_transform(X)
    plt.scatter(points[:, 0], points[:, 1], c=y, cmap="coolwarm", alpha=0.7)
    plt.title(f"{method.upper()} Visualization")
    plt.show()

# ========================================
# 9. Confidence Predictions + Sample Job Recommendations
# ========================================
sample_texts = ["Great culture and work life balance",
                "Toxic management and very stressful",
                "Decent company but long hours"]

# Use fine-tuned BERT for demonstration
# `model` is expected to be the fine-tuned bert-base-uncased classifier.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# The training texts were passed through clean_text(), so the demo inputs get
# the same normalisation; the raw strings are still what gets printed.
cleaned_samples = [clean_text(t) for t in sample_texts]
encodings = tokenizer(cleaned_samples, return_tensors="pt", padding=True, truncation=True)
# Training may have left the model on the GPU; inputs must be on its device.
encodings = {k: v.to(model.device) for k, v in encodings.items()}

model.eval()
with torch.no_grad():  # inference only
    outputs = model(**encodings)
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu()
preds = torch.argmax(probs, dim=1)

for txt, p, pr in zip(sample_texts, preds, probs):
    class_id = int(p)
    print(f"Text: {txt}")
    print(f"Prediction: {le.classes_[class_id]}")
    print(f"Confidence: {pr[class_id].item():.2f}")
    print("-"*40)
