In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the compressed matrix
matrix_data = np.load('adjacency_matrix.npz')
adj_matrix = matrix_data['matrix']

# Load node mapping
node_mapping = pd.read_csv('node_mapping.csv')

# Create DataFrame with proper indexing
df = pd.DataFrame(
    adj_matrix,
    index=node_mapping['node_id'],
    columns=node_mapping['node_id']
)

# Add node type information
df.index.name = 'source_id'
df.columns.name = 'target_id'

In [3]:
print("Matrix Statistics:")
print(f"Shape: {df.shape}")
print(f"Non-zero elements: {np.count_nonzero(adj_matrix)}")
print(f"Matrix density: {np.count_nonzero(adj_matrix) / (adj_matrix.shape[0] * adj_matrix.shape[1]):.4f}")

Matrix Statistics:
Shape: (14078, 14078)
Non-zero elements: 595717
Matrix density: 0.0030


In [4]:
# Get journal and paper indices
journal_mask = node_mapping['type'] == 'Journal'
journal_ids = node_mapping[journal_mask]['node_id'].values
paper_ids = node_mapping[~journal_mask]['node_id'].values

print(f"Number of journals: {len(journal_ids)}")
print(f"Number of papers: {len(paper_ids)}")

Number of journals: 66
Number of papers: 14012


In [6]:
# Cell 4: Analyze Journal-Journal Relationships (Fixed)
# Verify that all journal IDs exist in the DataFrame
valid_journal_ids = [jid for jid in journal_ids if jid in df.index and jid in df.columns]
print(f"\nValid journal IDs: {len(valid_journal_ids)}")

# Extract journal submatrix using only valid IDs
journal_submatrix = df.loc[valid_journal_ids, valid_journal_ids]

print("\nJournal-Journal Relationships:")
print(f"Number of journals: {len(valid_journal_ids)}")
print(f"Submatrix shape: {journal_submatrix.shape}")
print(f"Non-zero relationships: {np.count_nonzero(np.triu(journal_submatrix.values, k=1))}")

if np.count_nonzero(journal_submatrix.values) > 0:
    # Get only upper triangle values (excluding diagonal)
    upper_triangle = np.triu(journal_submatrix.values, k=1)
    nonzero_vals = upper_triangle[upper_triangle != 0]
    if len(nonzero_vals) > 0:
        print(f"Weight range: {nonzero_vals.min():.3f} to {nonzero_vals.max():.3f}")
        print(f"Average weight: {nonzero_vals.mean():.3f}")

# Optional: Display a sample of the journal submatrix
print("\nSample of journal submatrix (5x5):")
print(journal_submatrix.iloc[:5, :5])

# Cell 5: Debug Information
print("\nDebug Information:")
print("DataFrame index type:", type(df.index))
print("Journal IDs type:", type(journal_ids))
print("Sample index values:", list(df.index[:5]))
print("Sample journal IDs:", list(journal_ids[:5]))

# Verify data types are consistent
print("\nData type verification:")
print("Index dtype:", df.index.dtype)
print("Journal IDs dtype:", journal_ids.dtype)

# Check for any potential type mismatches
if df.index.dtype != journal_ids.dtype:
    print("Converting journal IDs to match index type...")
    journal_ids = journal_ids.astype(df.index.dtype)


Valid journal IDs: 66

Journal-Journal Relationships:
Number of journals: 66
Submatrix shape: (4356, 4356)
Non-zero relationships: 2145
Weight range: 0.057 to 0.057
Average weight: 0.057

Sample of journal submatrix (5x5):
target_id  NaN  NaN  NaN  NaN  NaN
source_id                         
NaN        0.0  0.0  0.0  0.0  0.0
NaN        0.0  0.0  0.0  0.0  0.0
NaN        0.0  0.0  0.0  0.0  0.0
NaN        0.0  0.0  0.0  0.0  0.0
NaN        0.0  0.0  0.0  0.0  0.0

Debug Information:
DataFrame index type: <class 'pandas.core.indexes.base.Index'>
Journal IDs type: <class 'numpy.ndarray'>
Sample index values: [nan, nan, nan, nan, nan]
Sample journal IDs: [nan, nan, nan, nan, nan]

Data type verification:
Index dtype: object
Journal IDs dtype: object


In [None]:

# Cell 2: Load Matrix and Mapping


# Cell 3: Basic Matrix Statistics


# Cell 4: Get Journal and Paper IDs


# Cell 5: Analyze Journal-Journal Relationships


# Cell 6: Analyze Paper-Paper Relationships
# Extract paper submatrix
paper_submatrix = df.loc[paper_ids, paper_ids]
print("\nPaper-Paper Relationships:")
print(f"Number of papers: {len(paper_ids)}")
print(f"Non-zero relationships: {np.count_nonzero(np.triu(paper_submatrix.values, k=1))}")
if np.count_nonzero(paper_submatrix.values) > 0:
    nonzero_vals = paper_submatrix.values[np.nonzero(paper_submatrix.values)]
    print(f"Weight range: {nonzero_vals.min():.3f} to {nonzero_vals.max():.3f}")
    print(f"Average weight: {nonzero_vals.mean():.3f}")

# Cell 7: Analyze Paper-Journal Relationships
# Extract paper-journal submatrix
paper_journal_submatrix = df.loc[paper_ids, journal_ids]
print("\nPaper-Journal Relationships:")
print(f"Non-zero relationships: {np.count_nonzero(paper_journal_submatrix.values)}")

# Cell 8: Create Heatmap of Journal-Journal Relationships
plt.figure(figsize=(12, 10))
sns.heatmap(journal_submatrix, cmap='YlOrRd', center=0)
plt.title('Journal-Journal Similarity Heatmap')
plt.show()

# Cell 9: Distribution of Similarity Weights
# For Journal-Journal relationships
journal_weights = journal_submatrix.values[np.triu_indices_from(journal_submatrix.values, k=1)]
journal_weights = journal_weights[journal_weights != 0]

plt.figure(figsize=(10, 6))
plt.hist(journal_weights, bins=50, alpha=0.7, color='blue', label='Journal-Journal')
plt.title('Distribution of Journal-Journal Similarity Weights')
plt.xlabel('Similarity Weight')
plt.ylabel('Count')
plt.legend()
plt.show()

# Cell 10: Example of Accessing Specific Relationships
# Get relationships for a specific journal
specific_journal_id = journal_ids[0]
journal_relationships = df.loc[specific_journal_id, journal_ids]
journal_relationships = journal_relationships[journal_relationships > 0]  # Get only non-zero relationships

print("Relationships for journal:", specific_journal_id)
print(journal_relationships.sort_values(ascending=False).head())

# Cell 11: Top Similar Journal Pairs
# Create a DataFrame of journal pairs and their similarities
journal_pairs = []
for i in range(len(journal_ids)):
    for j in range(i+1, len(journal_ids)):
        weight = journal_submatrix.iloc[i, j]
        if weight > 0:
            journal_pairs.append({
                'Journal1': journal_ids[i],
                'Journal2': journal_ids[j],
                'Similarity': weight
            })

journal_pairs_df = pd.DataFrame(journal_pairs)
print("\nTop 10 Most Similar Journal Pairs:")
print(journal_pairs_df.sort_values('Similarity', ascending=False).head(10))

# Cell 12: Journal Connectivity Analysis
journal_connections = (journal_submatrix > 0).sum()
print("\nJournal Connectivity:")
print("Top 10 Most Connected Journals:")
print(journal_connections.sort_values(ascending=False).head(10))

# Cell 13: Save Analysis Results
analysis_results = {
    'journal_stats': {
        'count': len(journal_ids),
        'total_relationships': np.count_nonzero(np.triu(journal_submatrix.values, k=1)),
        'avg_weight': journal_weights.mean() if len(journal_weights) > 0 else 0,
        'min_weight': journal_weights.min() if len(journal_weights) > 0 else 0,
        'max_weight': journal_weights.max() if len(journal_weights) > 0 else 0
    },
    'paper_stats': {
        'count': len(paper_ids),
        'total_relationships': np.count_nonzero(np.triu(paper_submatrix.values, k=1))
    },
    'paper_journal_stats': {
        'total_relationships': np.count_nonzero(paper_journal_submatrix.values)
    }
}

# Convert to DataFrame for easy viewing
analysis_df = pd.DataFrame.from_dict(analysis_results, orient='index')
print("\nAnalysis Summary:")
print(analysis_df)