In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
def extract_abstract_column(df):
    for col in df.columns:
        if col.strip().lower() == 'abstract':
            return col
    return None
def extract_tfidf_from_excel(file_path, max_keywords=10):
    xls = pd.ExcelFile(file_path)
    for sheet_name in xls.sheet_names:
        try:
            df = pd.read_excel(xls, sheet_name=sheet_name, header=1)
            abstract_col = extract_abstract_column(df)
            if not abstract_col:
                print(f" No 'Abstract' column in: {sheet_name}")
                continue
            abstracts = df[abstract_col].dropna().astype(str).tolist()
            if not abstracts:
                print(f"No abstracts in sheet: {sheet_name}")
                continue
            combined_text = ' '.join(abstracts)
            vectorizer = TfidfVectorizer(stop_words='english', max_features=max_keywords)
            X = vectorizer.fit_transform([combined_text])
            keywords = vectorizer.get_feature_names_out()
            print(f"\n Researcher: {sheet_name}")
            print("Top Keywords:", ', '.join(keywords))
        except Exception as e:
            print(f" Error in sheet {sheet_name}: {e}")
extract_tfidf_from_excel("/content/DL-TASK1.xlsx")


 No 'Abstract' column in: 1-Danqi Chen

 Researcher: 2-Kai Zhao
Top Keywords: based, chitosan, control, error, estimation, method, performance, proposed, state, water

 Researcher: 3-Irene Li
Top Keywords: based, framework, knowledge, language, large, llms, medical, models, patients, qa

 Researcher: 4-Linqi Song
Top Keywords: based, code, data, federated, learning, llms, methods, model, models, user

 Researcher: 5-Thang Vu
Top Keywords: anchors, bler, cascade, feqe, fronthaul, image, paper, proposed, rate, rpn

 Researcher: 6-Nora Hollenstein
Top Keywords: data, eeg, eye, features, human, language, models, processing, reading, tracking

 Researcher: 7-Gabriella Lapesa
Top Keywords: argument, datasets, definitions, discourse, llms, moderation, political, quality, research, sexism

 Researcher: 8-Qiaoqiao She
Top Keywords: based, data, image, language, model, models, questions, table, text, training

 Researcher: 9-Ashkan Kazemi
Top Keywords: claim, claims, fact, language, matching, mi

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
model = SentenceTransformer('all-MiniLM-L6-v2')
excel_path = '/content/DL-TASK1.xlsx'
all_abstracts = []
xls = pd.ExcelFile(excel_path)
for sheet_name in xls.sheet_names:
    df = xls.parse(sheet_name)
    df.columns = [col.lower() for col in df.columns]
    if 'abstract' in df.columns:
        abstracts = df['abstract'].dropna().astype(str).tolist()
        all_abstracts.extend(abstracts)
all_abstracts = all_abstracts[:20]
if len(all_abstracts) < 2:
    print("Not enough abstracts found to compare. Found:", len(all_abstracts))
else:
    embeddings = model.encode(all_abstracts, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    pairwise_scores = [
        cosine_scores[i][j].item()
        for i in range(len(all_abstracts))
        for j in range(i + 1, len(all_abstracts))
    ]
    if pairwise_scores:
        average_similarity = sum(pairwise_scores) / len(pairwise_scores)
        print("Average Pairwise Similarity Score:", round(average_similarity, 4))
        if average_similarity > 0.75:
            print("Most abstracts are on a focused area.")
        elif average_similarity < 0.5:
            print("Abstracts cover diverse topics.")
        else:
            print("Moderate similarity: Some overlap, some diversity.")
    else:
        print("No pairwise comparisons were made.")


Not enough abstracts found to compare. Found: 0


In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
model = SentenceTransformer('all-MiniLM-L6-v2')
excel_path = '/content/DL-TASK1.xlsx'
xls = pd.ExcelFile(excel_path)
results = []
print("\n--- Computing Similarity Per Researcher ---")
for sheet_name in xls.sheet_names:
    try:
        df = xls.parse(sheet_name, header=1)
        df.columns = [col.strip().lower() for col in df.columns]

        if 'abstract' in df.columns:
            abstracts = df['abstract'].dropna().astype(str).tolist()

            if len(abstracts) < 2:
                print(f" Not enough abstracts for: {sheet_name} (found {len(abstracts)})")
                results.append((sheet_name, len(abstracts), "N/A", "Not Enough Data"))
                continue
            embeddings = model.encode(abstracts, convert_to_tensor=True)
            cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
            pairwise_scores = [
                cosine_scores[i][j].item()
                for i in range(len(abstracts))
                for j in range(i + 1, len(abstracts))
            ]
            avg_similarity = sum(pairwise_scores) / len(pairwise_scores)
            if avg_similarity >= 0.6:
                diversity = " Highly Focused"
            elif avg_similarity >= 0.3:
                diversity = "Moderately Diverse"
            else:
                diversity = "Highly Diverse"
            results.append((sheet_name, len(abstracts), round(avg_similarity, 4), diversity))
        else:
            print(f" 'Abstract' column not found in sheet: {sheet_name}")
            results.append((sheet_name, 0, "N/A", "No Abstract Column"))
    except Exception as e:
        print(f" Error processing {sheet_name}: {e}")
        results.append((sheet_name, 0, "Error", "Processing Error"))
print("\nPer Researcher Similarity & Diversity Classification:")
print(f"{'Researcher':<30} | {'#Abstracts':<10} | {'Avg Similarity':<15} | {'Diversity'}")
print("-" * 80)
for name, count, score, diversity in results:
    print(f"{name:<30} | {count:<10} | {score:<15} | {diversity}")



--- Computing Similarity Per Researcher ---
 'Abstract' column not found in sheet: 1-Danqi Chen
 'Abstract' column not found in sheet: Author_Profile

Per Researcher Similarity & Diversity Classification:
Researcher                     | #Abstracts | Avg Similarity  | Diversity
--------------------------------------------------------------------------------
1-Danqi Chen                   | 0          | N/A             | No Abstract Column
2-Kai Zhao                     | 20         | 0.047           | Highly Diverse
3-Irene Li                     | 20         | 0.2638          | Highly Diverse
4-Linqi Song                   | 20         | 0.2621          | Highly Diverse
5-Thang Vu                     | 4          | 0.2877          | Highly Diverse
6-Nora Hollenstein             | 20         | 0.5227          | Moderately Diverse
7-Gabriella Lapesa             | 20         | 0.4671          | Moderately Diverse
8-Qiaoqiao She                 | 13         | 0.4311          | Moderately

In [None]:
import pandas as pd
import itertools
import random
data = [
    {"Researcher": "Kai Zhao", "Similarity": 0.047},
    {"Researcher": "Irene Li", "Similarity": 0.2638},
    {"Researcher": "Linqi Song", "Similarity": 0.2621},
    {"Researcher": "Thang Vu", "Similarity": 0.2877},
    {"Researcher": "Nora Hollenstein", "Similarity": 0.5227},
    {"Researcher": "Gabriella Lapesa", "Similarity": 0.4671},
    {"Researcher": "Qiaoqiao She", "Similarity": 0.4311},
    {"Researcher": "Ashkan Kazemi", "Similarity": 0.2868},
    {"Researcher": "Yan Zhang", "Similarity": 0.0776},
    {"Researcher": "Arjun Reddy Akula", "Similarity": 0.3002},
    {"Researcher": "Saneem Ahmed Chemmengath", "Similarity": 0.3715},
    {"Researcher": "William Merrill", "Similarity": 0.3296},
    {"Researcher": "Yixin Nie", "Similarity": 0.2291},
    {"Researcher": "Urmish Thakker", "Similarity": 0.4001},
    {"Researcher": "Zhiqing Sun", "Similarity": 0.2902},
    {"Researcher": "Jinhua Du", "Similarity": 0.1444},
    {"Researcher": "Prashant Mathur", "Similarity": 0.218},
    {"Researcher": "Sanqiang Zhao", "Similarity": 0.2688},
    {"Researcher": "Md Mosharaf Hossain", "Similarity": 0.1196}
]

df = pd.DataFrame(data)
def classify_diversity(sim):
    if sim < 0.2:
        return "Highly Diverse"
    elif sim < 0.5:
        return "Moderately Diverse"
    else:
        return "Less Diverse"

df["DiversityLevel"] = df["Similarity"].apply(classify_diversity)
high_diverse = df[df["DiversityLevel"] == "Highly Diverse"]
mod_diverse = df[df["DiversityLevel"] == "Moderately Diverse"]
pairs = list(itertools.product(high_diverse["Researcher"], mod_diverse["Researcher"]))
themes = [
    {
        "theme": "Cognitive Multilingual Understanding",
        "justification": "Pairing researchers exploring multilingual or cross-lingual models with those studying semantic or discourse-level representations enables deeper cognitive grounding.",
        "fields": "Multilingual NLP + Cognitive Semantics"
    },
    {
        "theme": "Explainable and Trustworthy NLP Systems",
        "justification": "Merging low-overlap researchers with moderate similarity can foster models that combine symbolic reasoning with deep learning for interpretability.",
        "fields": "Explainable AI + Neural Reasoning"
    },
    {
        "theme": "Cross-Domain Adaptation for Low-Resource Languages",
        "justification": "Diverse linguistic expertise can complement domain-specialized NLP to build adaptable, transferable systems.",
        "fields": "Transfer Learning + Domain-Specific NLP"
    }
]
random.seed(42)
selected_pairs = random.sample(pairs, 3)
results = []
for idx, (r1, r2) in enumerate(selected_pairs):
    theme = themes[idx % len(themes)]
    results.append({
        "Researcher 1": r1,
        "Researcher 2": r2,
        "Collaboration Theme": theme["theme"],
        "Justification": theme["justification"],
        "Fields Bridged": theme["fields"]
    })
collab_df = pd.DataFrame(results)
print(collab_df)




  Researcher 1              Researcher 2  \
0    Jinhua Du           Prashant Mathur   
1     Kai Zhao  Saneem Ahmed Chemmengath   
2     Kai Zhao                Linqi Song   

                                 Collaboration Theme  \
0               Cognitive Multilingual Understanding   
1            Explainable and Trustworthy NLP Systems   
2  Cross-Domain Adaptation for Low-Resource Langu...   

                                       Justification  \
0  Pairing researchers exploring multilingual or ...   
1  Merging low-overlap researchers with moderate ...   
2  Diverse linguistic expertise can complement do...   

                            Fields Bridged  
0   Multilingual NLP + Cognitive Semantics  
1        Explainable AI + Neural Reasoning  
2  Transfer Learning + Domain-Specific NLP  


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_abstract_column(df):
    for col in df.columns:
        if isinstance(col, str) and col.strip().lower() == 'abstract':
            return col
    return None

def extract_tfidf_from_excel(file_path, max_keywords=10):
    xls = pd.ExcelFile(file_path)

    for sheet_name in xls.sheet_names:
        try:
            found = False
            for header_row in range(0, 5):  # Try first 5 rows as header candidates
                df = pd.read_excel(xls, sheet_name=sheet_name, header=header_row)
                abstract_col = extract_abstract_column(df)

                if abstract_col:
                    found = True
                    break

            if not found:
                print(f" No 'Abstract' column in: {sheet_name}")
                continue

            abstracts = df[abstract_col].dropna().astype(str).tolist()
            if not abstracts:
                print(f"No abstracts in sheet: {sheet_name}")
                continue

            combined_text = ' '.join(abstracts)

            vectorizer = TfidfVectorizer(stop_words='english', max_features=max_keywords)
            X = vectorizer.fit_transform([combined_text])
            keywords = vectorizer.get_feature_names_out()

            print(f"\n Researcher: {sheet_name}")
            print("Top Keywords:", ', '.join(keywords))

        except Exception as e:
            print(f"Error in sheet {sheet_name}: {e}")

# Example usage
extract_tfidf_from_excel("/content/DL-TASK1.xlsx")



 Researcher: 1-Danqi Chen
Top Keywords: data, knowledge, language, llms, model, models, performance, task, tasks, training

 Researcher: 2-Kai Zhao
Top Keywords: based, chitosan, control, error, estimation, method, performance, proposed, state, water

 Researcher: 3-Irene Li
Top Keywords: based, framework, knowledge, language, large, llms, medical, models, patients, qa

 Researcher: 4-Linqi Song
Top Keywords: based, code, data, federated, learning, llms, methods, model, models, user

 Researcher: 5-Thang Vu
Top Keywords: anchors, bler, cascade, feqe, fronthaul, image, paper, proposed, rate, rpn

 Researcher: 6-Nora Hollenstein
Top Keywords: data, eeg, eye, features, human, language, models, processing, reading, tracking

 Researcher: 7-Gabriella Lapesa
Top Keywords: argument, datasets, definitions, discourse, llms, moderation, political, quality, research, sexism

 Researcher: 8-Qiaoqiao She
Top Keywords: based, data, image, language, model, models, questions, table, text, training

 

In [None]:
import pandas as pd
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")  # Run `python -m spacy download en_core_web_sm` if not installed

def extract_abstract_column(df):
    for col in df.columns:
        if col.strip().lower() == 'abstract':
            return col
    return None

def perform_ner_from_excel(file_path):
    xls = pd.ExcelFile(file_path)

    for sheet_name in xls.sheet_names:
        try:
            df = pd.read_excel(xls, sheet_name=sheet_name, header=1)
            abstract_col = extract_abstract_column(df)

            if not abstract_col:
                print(f" No 'Abstract' column in: {sheet_name}")
                continue

            abstracts = df[abstract_col].dropna().astype(str).tolist()
            if not abstracts:
                print(f"No abstracts in sheet: {sheet_name}")
                continue

            combined_text = ' '.join(abstracts)
            doc = nlp(combined_text)

            print(f"\n Researcher: {sheet_name}")
            print("Named Entities Found (label: text):")
            for ent in doc.ents:
                print(f"{ent.label_}: {ent.text}")

        except Exception as e:
            print(f" Error in sheet {sheet_name}: {e}")

# Example usage
perform_ner_from_excel("/content/DL-TASK1.xlsx")


 No 'Abstract' column in: 1-Danqi Chen

 Researcher: 2-Kai Zhao
Named Entities Found (label: text):
DATE: 18 years ago
ORG: CoVs
ORG: CoVs
MONEY: 2019-nCoV
GPE: Wuhan
GPE: China
DATE: 12 December 2019
CARDINAL: 2,794
CARDINAL: 80
DATE: 26 January 2020
CARDINAL: five
PERCENT: 79.6%
DATE: 2019
PERCENT: 96%
CARDINAL: seven
DATE: 2019
CARDINAL: 2019
ORG: IoT
ORG: IoT
ORG: IoT
CARDINAL: three
PERSON: Lyapunov
ORG: BLF
ORDINAL: First
ORDINAL: second
CARDINAL: around 10
PERCENT: around 92%
PERCENT: 13.1%
PERCENT: 16.5%
ORG: State
PRODUCT: SoC
NORP: Coulomb
ORG: Kalman
GPE: Kalman
ORG: SoC
NORP: Kalman
ORDINAL: First
ORDINAL: Second
ORG: SoC
ORG: OCV-SoC
ORDINAL: Third
ORG: AEKF
ORG: SoC
ORG: SoC
PERCENT: less than 2%
CARDINAL: one
ORG: CNN
ORG: ResNet
ORG: DLA
ORG: ImageNet
ORG: https://mmcheng.net/res2net/.
CARDINAL: 1
CARDINAL: 2
CARDINAL: 3
CARDINAL: zero
ORG: Euler-Lagrange
PERSON: Lyapunov
CARDINAL: 1
NORP: C1
CARDINAL: 2
CARDINAL: 3
ORG: Nussbaum
CARDINAL: zero
CARDINAL: 4
ORG: NN
PERSO