In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from googlesearch import search
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def search_urls(query, max_results=10):
    """Return at most ``max_results`` result URLs for a Google search.

    Args:
        query: Search string passed to ``googlesearch.search``.
        max_results: Upper bound on the number of URLs collected
            (defaults to 10, the page size the original call requested).

    Returns:
        A list of URL strings. On any search error the error is printed and
        whatever was collected so far (possibly an empty list) is returned.
    """
    urls = []
    try:
        # ``num`` is only the page size. Without ``stop`` the generator keeps
        # requesting further result pages, so the cap must be passed
        # explicitly to really end up with ``max_results`` URLs.
        for url in search(query, num=max_results, stop=max_results):
            # Append one by one so partial results survive a mid-search error.
            urls.append(url)
    except Exception as e:
        # Best-effort: a failed search must not abort the whole pipeline.
        print(f"Error during Google search: {e}")
    return urls

def scrape_content(url, timeout=10):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Page address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        The paragraph text joined with spaces, with every whitespace run
        collapsed to a single space and the ends stripped. Returns ``""``
        (after printing the error) if the request or parsing fails, or if
        the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join(para.get_text() for para in paragraphs)
        return re.sub(r"\s+", " ", content).strip()
    except Exception as e:
        # Best-effort: one bad page must not abort the whole pipeline.
        print(f"Error scraping {url}: {e}")
        return ""

def analyze_text(text, model_name):
    """Score ``text`` on four change factors by counting keyword hits.

    Keywords are matched as case-insensitive substrings. Each factor's hit
    count is divided by the total number of hits across all factors.

    Args:
        text: Text to analyse.
        model_name: Prefix used to build the result keys.

    Returns:
        A dict mapping ``"<model_name>_<factor>"`` to that factor's share of
        all keyword hits. When no keyword occurs at all, every value is ``0``.
    """
    keyword_table = {
        'urgency': ['immediate', 'urgent', 'critical', 'priority'],
        'complexity': ['complex', 'difficult', 'challenging', 'multifaceted'],
        'resistance': ['resistance', 'opposition', 'pushback', 'reluctance'],
        'change_level': ['organization-wide', 'departmental', 'team-level']
    }

    lowered = text.lower()

    # Raw number of keyword occurrences per factor.
    hits_by_factor = {}
    for factor, keywords in keyword_table.items():
        hits = 0
        for keyword in keywords:
            hits += lowered.count(keyword)
        hits_by_factor[factor] = hits

    grand_total = sum(hits_by_factor.values())
    if grand_total <= 0:
        # No evidence for any factor: report zeros rather than dividing by 0.
        return {f"{model_name}_{factor}": 0 for factor in keyword_table}

    return {
        f"{model_name}_{factor}": hits / grand_total
        for factor, hits in hits_by_factor.items()
    }

def calculate_model_efficiency(all_text, models):
    """Score every model's factor profile, weighted by corpus relevance.

    For each model name the function
      1. computes the TF-IDF cosine similarity between the name and the
         whole corpus (the relevance score),
      2. extracts every passage that starts at a mention of the name and
         runs to the next blank line (or the end of the text),
      3. scores those passages with ``analyze_text`` and multiplies each
         factor score by the relevance score.

    Args:
        all_text: Scraped corpus; pages are separated by blank lines.
        models: Iterable of model names (plain strings).

    Returns:
        A flat dict ``{"<model>_<factor>": weighted_score}`` covering all
        models.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([all_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the corpus
        # has no usable tokens, e.g. because every scrape failed. Fall back to
        # zero relevance instead of crashing the pipeline.
        tfidf_matrix = None

    model_efficiencies = {}
    for model in models:
        if tfidf_matrix is None:
            relevance_score = 0.0
        else:
            model_query = vectorizer.transform([model])
            relevance_score = cosine_similarity(model_query, tfidf_matrix)[0][0]

        # Escape the name so regex metacharacters in it are matched literally;
        # the raw string keeps ``\Z`` a regex anchor, not a string escape.
        pattern = rf"(?i){re.escape(model)}.*?(?=\n\n|\Z)"
        model_text = " ".join(re.findall(pattern, all_text, re.DOTALL))

        factor_scores = analyze_text(model_text, model)
        factor_scores = {k: v * relevance_score for k, v in factor_scores.items()}
        model_efficiencies.update(factor_scores)

    return model_efficiencies

def main_pipeline(change_strategies):
    """Search, scrape and score sources for each change strategy.

    For every strategy the function runs a web search, appends the text of
    each result page to a running corpus, scores all strategies against that
    corpus and adds the scores to per-model running totals. At the end each
    model's totals are rescaled to sum to 1 (when their sum is positive).

    NOTE(review): the corpus is cumulative, so text scraped for earlier
    strategies is scored again on every later iteration — confirm this
    weighting is intended.

    Args:
        change_strategies: List of model names to analyse.

    Returns:
        A pandas DataFrame with one row per model and one column per factor.
    """
    corpus = ""
    totals = defaultdict(lambda: defaultdict(float))

    for strategy in change_strategies:
        print(f"\nAnalyzing {strategy}...")
        found_links = search_urls(f"{strategy} change management:google scholar")

        for link in found_links:
            print(f"Scraping {link}...")
            page_text = scrape_content(link)
            # Blank line between pages: the passage regex stops there.
            corpus = corpus + page_text + "\n\n"

        scored = calculate_model_efficiency(corpus, change_strategies)

        for label, weight in scored.items():
            # Keys look like "<model>_<factor>"; split on the first underscore.
            name, factor = label.split('_', 1)
            totals[name][factor] += weight

    # Rescale each model's accumulated scores so they sum to 1.
    for name in totals:
        row_sum = sum(totals[name].values())
        if row_sum > 0:
            totals[name] = {
                factor: weight / row_sum
                for factor, weight in totals[name].items()
            }

    return pd.DataFrame(totals).T

# Script entry point: score a fixed pair of change management models.
if __name__ == "__main__":
    # Other candidates that can be swapped in: ["ADKAR", "Kotter's 8-Step"]
    change_strategies = [
        "Lewin's Change Model",
        "McKinsey 7S",
    ]

    efficiency_df = main_pipeline(change_strategies)

    # Heading and table, each on its own line.
    print("\nCalculated Model Efficiencies:", efficiency_df, sep="\n")


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from googlesearch import search
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Keyword lexicon used by analyze_text: for every factor, the terms counted as
# evidence of a low, medium or high level of that factor.
FACTOR_KEYWORDS = {
    'urgency': dict(
        low=['long-term', 'gradual', 'phased', 'incremental'],
        medium=['managed', 'planned', 'scheduled', 'moderate'],
        high=['urgent', 'immediate', 'critical', 'priority'],
    ),
    'complexity': dict(
        low=['simple', 'basic', 'straightforward', 'routine'],
        medium=['moderate', 'structured', 'organized', 'manageable'],
        high=['complex', 'challenging', 'multifaceted', 'difficult'],
    ),
    'resistance': dict(
        low=['acceptance', 'support', 'adoption', 'welcome'],
        medium=['neutral', 'mixed', 'varied', 'conditional'],
        high=['resistance', 'opposition', 'pushback', 'reluctance'],
    ),
    'change_level': dict(
        low=['individual', 'personal', 'role-specific', 'task-level'],
        medium=['departmental', 'team-level', 'group', 'unit'],
        high=['organization-wide', 'enterprise', 'cross-functional', 'strategic'],
    ),
}

def search_urls(query, max_results=10):
    """Return at most ``max_results`` result URLs for a Google search.

    Args:
        query: Search string passed to ``googlesearch.search``.
        max_results: Upper bound on the number of URLs collected
            (defaults to 10, the page size the original call requested).

    Returns:
        A list of URL strings. On any search error the error is printed and
        whatever was collected so far (possibly an empty list) is returned.
    """
    urls = []
    try:
        # ``num`` is only the page size. Without ``stop`` the generator keeps
        # requesting further result pages, so the cap must be passed
        # explicitly to really end up with ``max_results`` URLs.
        for url in search(query, num=max_results, stop=max_results):
            # Append one by one so partial results survive a mid-search error.
            urls.append(url)
    except Exception as e:
        # Best-effort: a failed search must not abort the whole pipeline.
        print(f"Error during Google search: {e}")
    return urls

def scrape_content(url, timeout=10):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Page address to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        The paragraph text joined with spaces, with every whitespace run
        collapsed to a single space and the ends stripped. Returns ``""``
        (after printing the error) if the request or parsing fails, or if
        the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")
        content = " ".join(para.get_text() for para in paragraphs)
        return re.sub(r"\s+", " ", content).strip()
    except Exception as e:
        # Best-effort: one bad page must not abort the whole pipeline.
        print(f"Error scraping {url}: {e}")
        return ""


def analyze_text(text, model_name):
    """Score ``text`` on every factor in ``FACTOR_KEYWORDS``.

    For each factor, the keywords of its low/medium/high levels are counted
    as case-insensitive substrings, and the factor score is the count-weighted
    average of the level weights. More high-level keywords push the score
    towards 1, more low-level keywords push it towards 0.1.

    Args:
        text: Text to analyse.
        model_name: Prefix used to build the result keys.

    Returns:
        A dict mapping ``"<model_name>_<factor>"`` to the weighted score, or
        to ``0`` when none of the factor's keywords occur in ``text``.
    """
    # NOTE(review): the original literal for 'high' was ``0.`` (i.e. 0.0), so
    # high-level keywords contributed nothing and a purely high-level text
    # scored below a medium one. It is restored to 1.0 as stated in the
    # original scale comment. 'low' stays at the coded 0.1 although that
    # comment said 0 — confirm which value is intended.
    level_weights = {'low': 0.1, 'medium': 0.5, 'high': 1.0}

    # Lower-case once instead of once per keyword.
    lowered = text.lower()

    factor_scores = {}
    for factor, levels in FACTOR_KEYWORDS.items():
        # Count occurrences for each level
        counts = {
            level: sum(lowered.count(word) for word in levels[level])
            for level in level_weights
        }

        total = sum(counts.values())
        if total > 0:
            weighted_score = sum(
                counts[level] * weight for level, weight in level_weights.items()
            ) / total
        else:
            # No keyword of this factor found: no evidence either way.
            weighted_score = 0

        factor_scores[f"{model_name}_{factor}"] = weighted_score

    return factor_scores

def calculate_model_efficiency(all_text, models):
    """Score every model's factor profile, weighted by corpus relevance.

    For each model name the function
      1. computes the TF-IDF cosine similarity between the name and the
         whole corpus (the relevance score),
      2. extracts every passage that starts at a mention of the name and
         runs to the next blank line (or the end of the text),
      3. scores those passages with ``analyze_text`` and multiplies each
         factor score by the relevance score.

    Args:
        all_text: Scraped corpus; pages are separated by blank lines.
        models: Iterable of model names (plain strings).

    Returns:
        A flat dict ``{"<model>_<factor>": weighted_score}`` covering all
        models.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([all_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the corpus
        # has no usable tokens, e.g. because every scrape failed. Fall back to
        # zero relevance instead of crashing the pipeline.
        tfidf_matrix = None

    model_efficiencies = {}
    for model in models:
        if tfidf_matrix is None:
            relevance_score = 0.0
        else:
            model_query = vectorizer.transform([model])
            relevance_score = cosine_similarity(model_query, tfidf_matrix)[0][0]

        # Find model-specific content: from each (case-insensitive) mention of
        # the name up to the next blank line or the end of the corpus.
        model_text = re.findall(fr"(?i){re.escape(model)}.*?(?=\n\n|\Z)", all_text, re.DOTALL)
        model_text = " ".join(model_text)

        # Get factor scores and weight by relevance
        raw_scores = analyze_text(model_text, model)
        weighted_scores = {k: v * relevance_score for k, v in raw_scores.items()}
        model_efficiencies.update(weighted_scores)

    return model_efficiencies

# Pipeline driver: search and scrape sources for each strategy, score them, and normalize per model.
def main_pipeline(change_strategies):
    """Search, scrape and score sources for each change strategy.

    For every strategy the function runs a web search, appends the text of
    each result page to a running corpus, scores all strategies against that
    corpus and adds the scores to per-model running totals. At the end each
    model's totals are rescaled to sum to 1 (when their sum is positive).

    NOTE(review): the corpus is cumulative, so text scraped for earlier
    strategies is scored again on every later iteration — confirm this
    weighting is intended.

    Args:
        change_strategies: List of model names to analyse.

    Returns:
        A pandas DataFrame with one row per model and one column per factor.
    """
    corpus = ""
    totals = defaultdict(lambda: defaultdict(float))

    for strategy in change_strategies:
        print(f"\nAnalyzing {strategy}...")
        found_links = search_urls(f"{strategy} change management strategy models:google scholar")

        for link in found_links:
            print(f"Scraping {link}")
            page_text = scrape_content(link)
            # Blank line between pages: the passage regex stops there.
            corpus = corpus + page_text + "\n\n"

        scored = calculate_model_efficiency(corpus, change_strategies)

        for label, weight in scored.items():
            # Keys look like "<model>_<factor>"; split on the first underscore.
            name, factor = label.split('_', 1)
            totals[name][factor] += weight

    # Rescale each model's accumulated scores so they sum to 1.
    for name in totals:
        row_sum = sum(totals[name].values())
        if row_sum > 0:
            totals[name] = {
                factor: weight / row_sum
                for factor, weight in totals[name].items()
            }

    return pd.DataFrame(totals).T
if __name__ == "__main__":
    # Script entry point: compare all four change management models.
    change_strategies = [
        "Lewin's Change Model",
        "McKinsey 7S",
        "Kotter's 8-Step",
        "ADKAR",
    ]

    efficiency_df = main_pipeline(change_strategies)

    # Heading and table, each on its own line.
    print("\nEnhanced Model Analysis:", efficiency_df, sep="\n")


In [None]:
import requests
from bs4 import BeautifulSoup
import re
from googlesearch import search
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model shared by analyze_factors. It is loaded once at
# import time and referenced there through this module-level name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Per-model weight of each factor, consumed by recommend_model. Each row sums
# to approximately 1.
# NOTE(review): the values look like output of an earlier scoring run —
# confirm their provenance before retuning.
MODEL_COEFFICIENTS = {
    "Lewin": {'urgency': 0.267402, 'complexity': 0.312645, 'resistance': 0.169246, 'change_level': 0.250708},
    "McKinsey 7S": {'urgency': 0.353329, 'complexity': 0.215851, 'resistance': 0.161507, 'change_level': 0.269313},
    "Kotter": {'urgency': 0.326516, 'complexity': 0.298532, 'resistance': 0.187739, 'change_level': 0.187214},
    "ADKAR": {'urgency': 0.347329, 'complexity': 0.302746, 'resistance': 0.195419, 'change_level': 0.154505}
}

# Reference phrases per factor as (phrase, weight) pairs. analyze_factors
# compares the scraped text with every phrase and keeps, per factor, the
# largest squared-similarity-times-weight value.
# There is no 'change_level' entry: recommend_model takes that factor from its
# `change_level_value` argument instead.
FACTOR_PHRASES = {
    'urgency': [
        ("critical emergency", 2.0), 
        ("immediate action required", 1.8),
        ("time-sensitive priority", 1.5)
    ],
    'complexity': [
        ("system integration challenges", 2.2),
        ("multidisciplinary technical complexity", 2.0),
        ("workflow disruption risks", 1.7)
    ],
    'resistance': [
        ("employee apprehension", 2.1),
        ("active change opposition", 2.0),
        ("skill gap concerns", 1.9)
    ]
}

def generate_search_queries(role, change):
    """Build the fixed set of six search queries for a role and a change.

    Args:
        role: Description of the person's role.
        change: Description of the change being introduced.

    Returns:
        A list of six query strings, in a fixed order, with ``role`` and
        ``change`` substituted into each template.
    """
    templates = (
        "{change} {role} site:linkedin.com",
        "{change} {role} filetype:pdf",
        "{change} workforce resistance site:academia.edu",
        "{role} change management best practices {change}",
        "{change} implementation complexity",
        "{role} team {change} urgency",
    )
    return [template.format(role=role, change=change) for template in templates]

def scrape_content(url):
    """Download ``url`` and return its paragraph/article/section text.

    Args:
        url: Page address to fetch.

    Returns:
        The text of the page's outermost ``<p>``, ``<article>`` and
        ``<section>`` elements joined with spaces, with every whitespace run
        collapsed to a single space and the ends stripped. Returns ``""``
        (after printing the error) if the request or parsing fails, or if
        the server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx answers as failures instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        wanted_tags = ['p', 'article', 'section']
        # get_text() already includes the text of nested elements, so a <p>
        # inside an <article> would otherwise be emitted twice. Keep only
        # matches that have no matching ancestor.
        outermost = [
            element for element in soup.find_all(wanted_tags)
            if element.find_parent(wanted_tags) is None
        ]
        content = " ".join(element.get_text() for element in outermost)
        return re.sub(r'\s+', ' ', content).strip()
    except Exception as e:
        # Best-effort: one bad page must not abort the whole analysis.
        print(f"Error scraping {url}: {e}")
        return ""

def analyze_factors(text):
    """Score ``text`` on each factor in ``FACTOR_PHRASES``.

    The text and every reference phrase are embedded with the module-level
    sentence-transformer ``model``. For each factor, the largest value of
    ``cosine_similarity ** 2 * weight`` over its phrases is passed through
    ``1 / (1 + exp(-3 * x))``.

    Because that pre-sigmoid value starts at 0 and is only ever raised, each
    returned score is at least 0.5.

    Args:
        text: Text to score.

    Returns:
        A dict mapping each factor name to its score.
    """
    document_vector = model.encode(text)

    scores = {}
    for factor, weighted_phrases in FACTOR_PHRASES.items():
        best = 0
        for phrase, weight in weighted_phrases:
            phrase_vector = model.encode(phrase)
            cosine = util.pytorch_cos_sim(document_vector, phrase_vector).item()
            # Squaring amplifies strong matches relative to weak ones.
            best = max(best, (cosine ** 2) * weight)

        scores[factor] = 1 / (1 + np.exp(-best*3))

    return scores

def get_contextual_scores(user_input, results_per_query=3):
    """Search the web for the described role/change and score what is found.

    Args:
        user_input: String of the form ``"Role: <role> Change: <change>"``.
        results_per_query: Maximum number of search results scraped for each
            generated query (defaults to 3, the page size the original call
            requested).

    Returns:
        The factor-score dict produced by ``analyze_factors`` for the first
        10000 characters of the scraped text.

    Raises:
        IndexError: If ``user_input`` contains no ``"Change:"`` marker.
    """
    parts = user_input.split("Change:")
    role = parts[0].replace("Role: ", "").strip()
    change = parts[1].strip()

    pages = []
    for query in generate_search_queries(role, change):
        try:
            # ``num`` is only the page size; ``stop`` is what actually caps
            # the number of results yielded per query.
            for url in search(query, num=results_per_query, stop=results_per_query):
                content = scrape_content(url)
                print(url)
                if content:
                    pages.append(content)
        except Exception as e:
            # Best-effort: a failing query must not abort the remaining ones.
            print(f"Search error for '{query}': {str(e)}")

    # Pages are separated (and terminated) by a blank line.
    all_text = "".join(page + "\n\n" for page in pages)

    # Only the first 10000 characters are analysed.
    return analyze_factors(all_text[:10000])

def recommend_model(user_input, change_level_value=0.2):
    """Recommend the change model with the highest weighted factor score.

    Each model's score is the sum of four terms, each a factor value raised
    to a fixed exponent (urgency 2, complexity 1.5, resistance 1.8, change
    level 1.2) and multiplied by that model's coefficient for the factor.

    Args:
        user_input: String of the form ``"Role: <role> Change: <change>"``,
            forwarded to ``get_contextual_scores``.
        change_level_value: Value used for the change-level factor.

    Returns:
        A dict with ``"factor_scores"`` (the contextual scores),
        ``"recommendation"`` (the name of the best-scoring model) and
        ``"model_scores"`` (every model's score rounded to 3 decimals).
    """
    factor_scores = get_contextual_scores(user_input)

    model_scores = {}
    for name, weights in MODEL_COEFFICIENTS.items():
        urgency_term = factor_scores['urgency']**2 * weights['urgency']
        complexity_term = factor_scores['complexity']**1.5 * weights['complexity']
        resistance_term = factor_scores['resistance']**1.8 * weights['resistance']
        level_term = change_level_value**1.2 * weights['change_level']
        model_scores[name] = urgency_term + complexity_term + resistance_term + level_term

    best_model = max(model_scores, key=model_scores.get)
    rounded_scores = {name: round(total, 3) for name, total in model_scores.items()}

    return {
        "factor_scores": factor_scores,
        "recommendation": best_model,
        "model_scores": rounded_scores
    }

if __name__ == "__main__":
    # Demo run: recommend a change model for one sample role and change.
    user_input = "Role: Electrical engineer working in the robotics team. Change: AI in the robotics project."
    result = recommend_model(user_input)

    print("Factor Scores (0-1 scale):")
    for factor, score in result['factor_scores'].items():
        print(f"{factor.capitalize()}: {score:.2f}")

    print("\nRecommended Model:", result['recommendation'])
    print("\nModel Scores:")
    # The loop variable must not be called `model`: at module scope that would
    # rebind the global SentenceTransformer instance used by analyze_factors
    # to a plain string and break every later call.
    for model_name, score in result['model_scores'].items():
        print(f"{model_name}: {score:.2f}")


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from kaggle.api.kaggle_api_extended import KaggleApi

# Step 1: Data Collection from Kaggle
# Authenticate first; the downloads below go through this client.
api = KaggleApi()
api.authenticate()

# Fetch both source datasets and unzip them into data/
api.dataset_download_files(
    'arashnic/hr-analytics-job-change-of-data-scientists',
    path='data/',
    unzip=True,
)
api.dataset_download_files(
    'gladdenme/factory-workers-daily-performance-attrition-s',
    path='data/',
    unzip=True,
)

# Step 2: Data Preparation
# Load the two CSV files extracted above.
hr_df = pd.read_csv(
    'data/aug_train.csv'
)
factory_df = pd.read_csv(
    'data/Factory_Workers_Performance.csv'
)

# Feature Engineering
def preprocess_data(df, dataset_type):
    """Build a textual ``role_context`` column next to the dataset's label.

    The input frame is left untouched; the original implementation added the
    ``role_context`` column to the caller's DataFrame as a side effect.

    Args:
        df: Source DataFrame. For ``'hr'`` it must contain ``enrollee_id``,
            ``city``, ``experience``, ``education_level`` and ``target``;
            otherwise ``Position``, ``Department``, ``Tenure``,
            ``PerformanceRating`` and ``Attrition``.
        dataset_type: ``'hr'`` selects the HR layout; any other value is
            treated as the factory layout.

    Returns:
        A new DataFrame with the columns ``role_context`` and the label
        column (``target`` for HR, ``Attrition`` otherwise), sharing the
        index of ``df``.
    """
    if dataset_type == 'hr':
        label_column = 'target'
        # The whitespace in the template is kept as in the original
        # triple-quoted string.
        role_context = df.apply(lambda x: (
            "\n"
            f"            Role: {x['enrollee_id']}, \n"
            f"            Domain: {x['city']}, \n"
            f"            Experience: {x['experience']} years, \n"
            f"            Education: {x['education_level']}\n"
            "        "
        ), axis=1)
    else:
        label_column = 'Attrition'
        role_context = df.apply(lambda x: (
            "\n"
            f"            Role: {x['Position']}, \n"
            f"            Department: {x['Department']}, \n"
            f"            Tenure: {x['Tenure']} months, \n"
            f"            Performance: {x['PerformanceRating']}\n"
            "        "
        ), axis=1)

    # Copy only the label column, then put the context in front of it, so the
    # caller's frame is never modified.
    result = df[[label_column]].copy()
    result.insert(0, 'role_context', role_context)
    return result

# Reduce both raw frames to (role_context, label) pairs.
hr_processed = preprocess_data(hr_df, dataset_type='hr')
factory_processed = preprocess_data(factory_df, dataset_type='factory')

# Step 3: Create Training Data with Change Management Context
# Strategy name -> short description of the kind of change it stands for.
# The key order matters: a strategy's position is used as its class index.
change_contexts = dict(
    ADKAR="Individual skill development and phased adoption",
    Kotter="Urgent organizational transformation",
    Lewin="Behavioral pattern changes",
    McKinsey="Structural realignment",
)

def create_training_examples(row):
    """Return one training text per strategy in ``change_contexts``.

    Each text combines the row's ``role_context`` with a strategy's change
    description and the strategy name itself.

    NOTE(review): the strategy name (the label) is part of the text, whereas
    the prompt built in recommend_strategy carries no
    "Recommended Strategy" line — confirm this is intended.

    Args:
        row: Mapping-like row providing a ``'role_context'`` entry.

    Returns:
        A list of strings, one per entry of ``change_contexts``, in the
        dictionary's order.
    """
    return [
        f"\n        Role Context: {row['role_context']}"
        f"\n        Change Type: {context}"
        f"\n        Recommended Strategy: {model}"
        "\n        "
        for model, context in change_contexts.items()
    ]

# Create training data: one synthetic example per strategy for every row of
# both processed datasets.
train_data = []
for _, row in hr_processed.iterrows():
    train_data.extend(create_training_examples(row))
for _, row in factory_processed.iterrows():
    train_data.extend(create_training_examples(row))

# Step 4: LLM Fine-Tuning
tokenizer = AutoTokenizer.from_pretrained("microsoft/xtremedistil-l12-h384-uncased")
# One output class per strategy.
model = AutoModelForSequenceClassification.from_pretrained("microsoft/xtremedistil-l12-h384-uncased", num_labels=len(change_contexts))

# Prepare dataset
train_texts, val_texts = train_test_split(train_data, test_size=0.2)
# The label is the position of the strategy named after "Recommended Strategy: ".
# dict.keys() is a view without .index() on Python 3, so the keys are turned
# into a list first.
labels = [list(change_contexts).index(text.split("Recommended Strategy: ")[-1].strip()) for text in train_texts]

class ChangeDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized texts with integer class labels."""

    def __init__(self, texts, labels):
        """Tokenize ``texts`` with the module-level tokenizer and keep ``labels``.

        Texts are padded and truncated to at most 512 tokens.
        """
        self.labels = labels
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=512)

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = ChangeDataset(train_texts, labels)
# Validation labels are derived like the training labels: the position of the
# strategy named after "Recommended Strategy: ". dict.keys() is a view without
# .index() on Python 3, so the keys are turned into a list first.
val_labels = [list(change_contexts).index(t.split("Recommended Strategy: ")[-1].strip()) for t in val_texts]
val_dataset = ChangeDataset(val_texts, val_labels)

# Training configuration
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Step 5: Model Training
trainer.train()

# Step 6: Prediction Pipeline
def recommend_strategy(user_input):
    """Predict the change strategy for a free-text role/change description.

    Args:
        user_input: String of the form ``"Role: <role> Change: <change>"``.

    Returns:
        The key of ``change_contexts`` whose class index has the highest
        logit for the given input.

    Raises:
        IndexError: If ``user_input`` contains no ``"Change:"`` marker.
    """
    role_part = user_input.split("Change:")[0].replace("Role: ", "").strip()
    change_part = user_input.split("Change:")[1].strip()

    inputs = tokenizer(
        f"Role Context: {role_part}\nChange Type: {change_part}",
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    # Training may have moved the model to an accelerator; the input tensors
    # must be on the same device as the model.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Inference mode: disable dropout and skip gradient tracking so the
    # prediction is deterministic and cheap.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_idx = torch.argmax(outputs.logits, dim=-1).item()
    return list(change_contexts.keys())[predicted_idx]

# Example usage: classify one sample role/change description.
user_input = (
    "Role: Robotics Engineer with 5 years experience. "
    "Change: AI-Powered Quality Control Implementation"
)
print("Recommended Strategy: " + str(recommend_strategy(user_input)))
