# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from collections import Counter, defaultdict
import networkx as nx

# Fetch every NLTK data package this script requests, one download call each.
# NOTE(review): 'conll2002' is not referenced by the code visible in this
# file; it is still downloaded here to keep the original behavior.
for nltk_resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'conll2002',
):
    nltk.download(nltk_resource)


# Load the dataset
dataset_path = 'fulltrain.csv'
df = pd.read_csv(dataset_path)

# Basic dataset information: shape, a preview of the first rows, and the
# number of missing values in each column.
num_rows, num_cols = df.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_cols}")
print(df.head())
missing_values = df.isnull().sum()
print("Missing values:")
print(missing_values)

# Dataset statistics
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Analyzing text data
# The text lives in the second column (position 1). Missing cells become empty
# strings and everything is coerced to str, so the per-row word count and the
# corpus-wide word frequencies treat missing/non-string values the same way
# (a missing cell counts as zero words instead of the literal word "nan", and
# no longer makes the frequency count fail on a non-string value).
text_column = df.iloc[:, 1].fillna('').astype(str)

df['Word Count'] = text_column.apply(lambda text: len(text.split()))
print(df['Word Count'].describe())

if df.empty:
    # idxmax()/idxmin() raise on an empty Series, so there is nothing to show.
    print("Dataset has no rows; skipping max/min word count rows.")
else:
    max_row = df.loc[df['Word Count'].idxmax()]
    print("Row with maximum words:")
    print(max_row)

    min_row = df.loc[df['Word Count'].idxmin()]
    print("Row with minimum words:")
    print(min_row)

# Count words row by row rather than joining the whole column into one large
# string first; whitespace splitting yields the same words either way.
word_counts = Counter()
for text in text_column:
    word_counts.update(text.split())

if word_counts:
    most_common_word = word_counts.most_common(1)[0]
    print("Most common word:", most_common_word[0], "appears:", most_common_word[1], "times")
else:
    # most_common(1) is an empty list when no words were found at all.
    print("Most common word: none (no words found)")

# Named Entity Recognition
# Only the first NER_CHAR_LIMIT characters of the file are analyzed, so read
# just that prefix instead of loading the whole file and slicing it afterwards.
# `data` therefore holds only that prefix.
# NOTE(review): this feeds the raw file text (all columns, delimiters
# included) to the tokenizer rather than only the text column - confirm that
# is intended.
NER_CHAR_LIMIT = 1000000  # Adjust according to file size
with open(dataset_path, 'r', encoding='utf-8') as file:
    data = file.read(NER_CHAR_LIMIT)

tokens = word_tokenize(data)
pos_tags = pos_tag(tokens)
ner_tags = ne_chunk(pos_tags)

# Extract entities
# Each chunk label of interest maps to the list that collects its entities;
# chunks with any other label are ignored.
persons, locations, organizations = [], [], []
entities_by_label = {
    'PERSON': persons,
    'GPE': locations,
    'ORGANIZATION': organizations,
}
for subtree in ner_tags:
    # Only nltk.Tree items carry an entity label; everything else is skipped.
    if not isinstance(subtree, nltk.Tree):
        continue
    bucket = entities_by_label.get(subtree.label())
    if bucket is not None:
        # Join the tokens of the chunk's leaves into one entity string.
        bucket.append(' '.join(token for token, _tag in subtree.leaves()))

print("Persons:", persons)
print("Locations:", locations)
print("Organizations:", organizations)

# Visualizations
# The loaded DataFrame is named `df` (there is no `dataset` variable in this
# script). Histograms and box plots are drawn from the numeric columns only.
numeric_df = df.select_dtypes(include='number')

if numeric_df.empty:
    print("No numeric data available; skipping histogram and box plot.")
else:
    numeric_df.hist(figsize=(10, 8))
    plt.show()

    plt.figure(figsize=(10, 8))
    sns.boxplot(data=numeric_df)
    plt.show()

# The scatter plot needs both named columns; skip it with a message instead of
# failing when the dataset does not contain them.
scatter_x, scatter_y = 'feature1', 'feature2'
if {scatter_x, scatter_y}.issubset(df.columns):
    sns.scatterplot(x=scatter_x, y=scatter_y, data=df)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Relationship between Feature 1 and Feature 2')
    plt.show()
else:
    print(f"Skipping scatter plot: columns {scatter_x!r} and {scatter_y!r} not both present.")

# Build and visualize an entity graph
# Every distinct entity becomes a node whose `type` attribute is the category
# of the list it came from, and all entities within the same category are
# connected to each other.
G = nx.Graph()
entity_groups = {
    'PERSON': persons,
    'GPE': locations,
    'ORGANIZATION': organizations,
}
for entity_type, entity_list in entity_groups.items():
    # Drop repeated mentions (keeping first-seen order) so each pair of
    # entities is linked once instead of once per duplicate mention.
    unique_entities = list(dict.fromkeys(entity_list))
    for index, entity in enumerate(unique_entities):
        # An entity present in several categories keeps the type of the last
        # category processed.
        G.add_node(entity, type=entity_type)
        # The graph is undirected, so linking each entity to the ones after it
        # covers every pair.
        for other_entity in unique_entities[index + 1:]:
            G.add_edge(entity, other_entity)

pos = nx.spring_layout(G, seed=42)
plt.figure(figsize=(12, 8))
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=2000, font_size=10, font_weight='bold')
plt.title('Named Entity Relations')
plt.show()
