In [30]:
import os
import pandas as pd

folder_path = "CleanedLinkedInData"
csv_files = [f for f in os.listdir(folder_path) if f.endswith(".csv")]

adj_list = {}
all_people = set()

# Build mutual connections
for filename in csv_files:
    owner = os.path.splitext(filename)[0]
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)

    df["Full Name"] = df["First Name"].astype(str).str.strip() + " " + df["Last Name"].astype(str).str.strip()

    all_people.add(owner)
    all_people.update(df["Full Name"].values)

    if owner not in adj_list:
        adj_list[owner] = []

    for name in df["Full Name"]:
        if name == owner:
            continue

        if name not in adj_list[owner]:
            adj_list[owner].append(name)

        if name not in adj_list:
            adj_list[name] = []
        if owner not in adj_list[name]:
            adj_list[name].append(owner)

for person in adj_list:
    adj_list[person] = sorted(adj_list[person])

print(f"Total people: {len(adj_list)}\n")
print("{")
for person, connections in adj_list.items():
    print(f"    '{person}': {connections},")
print("}")


Total people: 29215

{
    'Aaditya Raj': ['Aakash Deep', 'Aakash Kumar', 'Aarti Patil', 'Abhinab Roy', 'Abhishek Kumar', 'Abhishek Tripathi', 'Achal Agrawal PhD', 'Aditya Singh', 'Aditya agrahari', 'Adityansh Chand', 'Afzal Raza', 'Afzl Raza', 'Ajay Jatav', 'Ajay Kumar', 'Ajeet Kumar', 'Ajit Yadav', 'Akanksha', 'Alok Raj', 'Aman Adarsh', 'Aman Singh', 'Aman Verma', 'Amir Khan', 'Amit Diwakar', 'Amit Kumar', 'Amit Singhal', 'Ammar Husain', 'Amrita Kumari', 'Amrita Yadav', 'Anamika Kumari', 'Anand Pandey', 'Animesh Awasthi', 'Ankit Kumar', 'Ankita Pancholi', 'Anmol Kumar', 'Anoop Kumar', 'Anshu Kumar', 'Anshul Sharma', 'Anuradha Tiwari', 'Arjun Kadam', 'Arpit Sahu', 'Arpit Tiwari', 'Arpita Tripathi', 'Arun Kumar', 'Arun Singh', 'Aryan Saini', 'Ashutosh Kumar', 'Ashwin Yadav', 'Aslam Khan', 'Avinash kumar', 'Ayush Katiyar', 'Ayush Kumar', 'Ayush Yadav', 'Bhagwan singh', 'Bhagwati Chouhan', 'Bharat Suthar', 'Bhaskar Mahato', 'Bikram Ravidas', 'Byagari Kumar', 'Byagari Praveen Kumar', 'CHA

In [31]:
degree = {}
for person in adj_list:
    if person not in degree:
        degree[person] = 0
    degree[person] += len(adj_list[person])
    
for name, deg in sorted(degree.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: degree {deg}")

total_degrees = sum(len(friends) for friends in adj_list.values())
num_nodes = len(adj_list)
average_degree = total_degrees / num_nodes if num_nodes > 0 else 0


print(f" Average Degree: {average_degree: .2f}")

Rohit Malviya: degree 4337
Ravi Rajput: degree 4118
Manoj Dewda: degree 4019
Ramraj Nagar: degree 3764
Nirmal Mewada: degree 3575
Pranjal Dubey: degree 3387
Challa Trivedh Kumar: degree 2933
Neeraj Parmar: degree 2715
Himanshu Kumar: degree 2710
Aman Singh: degree 2685
Byagari Kumar: degree 2521
Janu Chaudhary: degree 2187
Ujjval Baijal: degree 2184
Prabhat Patidar: degree 2130
Alok Raj: degree 2034
Rajiv Kumar: degree 1949
Chandan Giri: degree 1938
Shubham Kumar: degree 1820
Prem Kumar: degree 1797
Ayush Kumar: degree 1713
Aryan Saini: degree 1619
Ompal Yadav: degree 1574
Shivang Dubey: degree 1539
Shubham Kang: degree 1488
Shivam Shukla: degree 1451
Mayank Raj: degree 1387
Mohit Sharma: degree 1282
Manish Kumar Tiwari: degree 1255
Ishant Bhoyar: degree 1030
Gaurav Rathore: degree 1018
Ranjeet Yadav: degree 927
Anamika Kumari: degree 890
Monu Rajpoot: degree 890
Prachi Dhakad: degree 882
Ekta Kumari: degree 838
Mehtab Alam: degree 812
Bhaskar Mahato: degree 792
Vishal Kumar: degree 77

In [48]:
connections = degree
thresholds = [1000, 2000, 3000, 4000]
threshold_counts = {}

for t in thresholds:
    count = sum(1 for v in connections.values() if v > t)
    threshold_counts[f"> {t}"] = count

for label, count in threshold_counts.items():
    print(f"{label:>6}: {count}")

> 1000: 30
> 2000: 15
> 3000: 6
> 4000: 3


In [27]:
import json
import random
import os
import statistics

json_file_path ="adjacency_list.json"

if not os.path.exists(json_file_path):
    print(f" File not found: {json_file_path}")
else:
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            adjacency_list = json.load(f)
    except json.JSONDecodeError as e:
        print(f" JSON decode error: {e}")
        adjacency_list = {}

    students = [s for s, neighbors in adjacency_list.items() if neighbors]

    if len(students) < 2:
        print(" Not enough students with connections.")
    else:
        def random_walk(graph, start, end, max_steps=15):
            path = [start]
            current = start
            for _ in range(max_steps):
                neighbors = graph.get(current, [])
                if not neighbors:
                    break
                next_node = random.choice(neighbors)
                path.append(next_node)
                if next_node == end:
                    break
                current = next_node
            return path

        def prune_path(path):
            seen = set()
            pruned = []
            for node in path:
                if node not in seen:
                    pruned.append(node)
                    seen.add(node)
            return pruned

        num_examples = 50
        walk_lengths, pruned_lengths = [], []
        examples = []

        for _ in range(num_examples):
            start, end = random.sample(students, 2)
            walk = random_walk(adjacency_list, start, end)
            pruned = prune_path(walk)

            walk_lengths.append(len(walk))
            pruned_lengths.append(len(pruned))

            examples.append({
                "start": start,
                "end": end,
                "walk": walk,
                "pruned": pruned
            })


        for idx, ex in enumerate(examples, 1):
            print(f"\n Example {idx}: {ex['start']} to {ex['end']}")
            print("   Random Walk : " + " , ".join(ex['walk']))
            print("   Pruned Path : " + " , ".join(ex['pruned']))

        def safe_mode(data):
            try:
                return statistics.mode(data)
            except statistics.StatisticsError:
                return "No unique mode"

        print("\n" + "="*70)
        print(" STATISTICAL SUMMARY")
        print("="*70)

        summary = {
            "Total Examples"            : num_examples,
            "Average Walk Length"       : round(statistics.mean(walk_lengths), 2),
            "Average Pruned Path Length": round(statistics.mean(pruned_lengths), 2),
            "Minimum Walk Length"       : min(walk_lengths),
            "Maximum Walk Length"       : max(walk_lengths),
            "Minimum Pruned Path Length": min(pruned_lengths),
            "Maximum Pruned Path Length": max(pruned_lengths),
            "Median Walk Length"        : statistics.median(walk_lengths),
            "Median Pruned Path Length" : statistics.median(pruned_lengths),
            "Mode Walk Length"          : safe_mode(walk_lengths),
            "Mode Pruned Length"        : safe_mode(pruned_lengths),
            "Std Dev of Walk Length"       : round(statistics.stdev(walk_lengths), 2),
            "Std Dev of Pruned Length"     : round(statistics.stdev(pruned_lengths), 2),
        }

        for k, v in summary.items():
            print(f"{k:<30}: {v}")



 Example 1: Vedant Dadhich to Aditya Priyadarshi
   Random Walk : Vedant Dadhich , Prabhat_Patidar - prabhat patidar , Duleshwar Kumar Verma , Connections - RAVI RAJPUT , Sanashish Sanki , Neeraj_Parmar - NEERAJ PARMAR , Tanmay Singh , Ramraj_Nagar - Ramraj Nagar , Karvika Kushwaha , Ramraj_Nagar - Ramraj Nagar , Rohit L , Connections - RAVI RAJPUT , Sachchidanand Mishra , Connections - RAVI RAJPUT , Madhuri Sabale , Shubham Kumar - Shubham Kumar
   Pruned Path : Vedant Dadhich , Prabhat_Patidar - prabhat patidar , Duleshwar Kumar Verma , Connections - RAVI RAJPUT , Sanashish Sanki , Neeraj_Parmar - NEERAJ PARMAR , Tanmay Singh , Ramraj_Nagar - Ramraj Nagar , Karvika Kushwaha , Rohit L , Sachchidanand Mishra , Madhuri Sabale , Shubham Kumar - Shubham Kumar

 Example 2: Umesh kumar to Uvesh Shaikh
   Random Walk : Umesh kumar , Prem kumar , Garima Minglani , Prem kumar , Ishika Verma , Prem kumar , Kushagra Nigam , Monu_Rajpoot - Monu Rajpoot , Shobhit Gautam , Anuradha_Tiwari - Anurad

In [19]:
import os
import pandas as pd

def top_five_companies_from_folder(folder_path):
    # Create an empty dictionary to count company occurrences
    company_count = {}

    # List all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            # Get the full file path
            file_path = os.path.join(folder_path, filename)

            # Read the CSV into a DataFrame
            df = pd.read_csv(file_path)

            # Drop rows where company is missing
            df = df.dropna(subset=["Company"])

            # Count company occurrences in this file
            for company in df["Company"]:
                if company in company_count:
                    company_count[company] += 1
                else:
                    company_count[company] = 1

    # Sort companies by count in descending order and get the top 5
    sorted_companies = sorted(company_count.items(), key=lambda x: x[1], reverse=True)[:10]

    # Print the Top 5 Companies
    print(" Top 10 Companies Across All CSVs:")
    for company, count in sorted_companies:
        print(f"- {company}: {count} connections")

    return sorted_companies

# Set your folder path containing the CSV files
folder_path = "CleanedLinkedInData"

# Run the function to extract company names from all CSVs in the folder
company_list = top_five_companies_from_folder(folder_path)



 Top 10 Companies Across All CSVs:
- Sitare University: 1205 connections
- HCLTech: 1184 connections
- Google: 606 connections
- Amazon: 525 connections
- Dakshana Foundation: 495 connections
- Microsoft: 411 connections
- Tata Consultancy Services: 370 connections
- ZingHR: 366 connections
- Selfemployed: 350 connections
- Freelance: 318 connections
