In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
import seaborn as sns

import itertools
import re
import string
import pickle
import os

import nltk
# Fetch the NLTK data packages (stop words, Punkt tokenizer, WordNet,
# perceptron POS tagger) one after another, in this fixed order.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import KFold

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet
from wordcloud import WordCloud
from copy import deepcopy

from IPython.display import (
    Markdown as md,
    Latex,
    HTML,
)

from tqdm.auto import tqdm
import json

# Switch on seaborn's default plot theme for the figures in this notebook.
sns.set()

from google.colab import drive

# Mount Google Drive; the CSV paths used further down live under this directory.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Mounted at /content/drive


In [None]:
# Read the full news-feed export from the mounted Google Drive folder.
news_feed_csv = "/content/drive/MyDrive/nlp data/Data/news_feed_all_docs_dropped.csv"
df = pd.read_csv(news_feed_csv)

In [None]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,categories,url,title,content,author,date,spider_type
0,['Manchester City'],https://www.mirror.co.uk/sport/football/news/p...,10 most expensive Premier League keepers with ...,The top 10 most valuable Premier League goalke...,['Matthew Cooper'],2023-09-25 14:31:18+00:00,newsnow
1,['Manchester United'],https://metro.co.uk/2023/09/25/gran-74-left-sh...,Gran in crash with Marcus Rashford had no idea...,The England striker was involved in the crash ...,['Liam Coleman'],2023-09-25 15:58:13+00:00,newsnow
2,['Manchester United'],https://www.manutd.com/en/news/detail/man-utd-...,Match preview: United v Palace,Skip to Main NavigationSkip to contentHow to f...,['Matthew Holt'],2023-09-25 16:12:33+00:00,newsnow
3,['Manchester United'],https://www.teamtalk.com/manchester-united/ten...,Man Utd want €50m LaLiga star in huge January ...,Manchester United are reportedly weighing up a...,['James Marshment'],2023-09-25 15:51:58+00:00,newsnow
4,['Manchester City'],https://www.manchestercity.news/soon-pep-guard...,Pep Guardiola delivers new injury update on Ma...,Pep Guardiola gave an injury update on Manches...,['Tom Procter'],2023-09-25 15:34:43+00:00,newsnow


In [None]:
!pip install spacy tqdm
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load spaCy's small English pipeline; `nlp` is the parser shared by later cells.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

In [None]:
# Sample the first five article bodies for a quick look at spaCy's annotations.
top_five_content = df['content'].head(5)

# Parse each sample and dump its entities plus the per-token annotations.
for idx, article_text in enumerate(top_five_content):
    doc = nlp(article_text)

    # Collect everything first so the print section below stays readable.
    entity_texts = [span.text for span in doc.ents]
    token_texts = [tok.text for tok in doc]
    pos_tags = [tok.pos_ for tok in doc]
    dep_labels = [tok.dep_ for tok in doc]

    print(f"Text {idx + 1}:")
    print(f"Entities: {entity_texts}")
    print(f"TOKENS: {token_texts}")
    print(f"POS Tags: {pos_tags}")
    print(f"Dependency Labels: {dep_labels}")
    print("\n")

Text 1:
Entities: ['10', 'Premier League', 'Liverpool', 'Alisson', "Manchester City's", 'Liverpool', 'Ederson and AlissonLiverpool', 'Alisson', "Manchester City's", 'Ederson', 'two', 'The Football Observatory', 'Alisson', 'Ederson', 'nine', '10', 'two', 'Mirror Football', '10', 'Arijanet Muric - Burnley', 'Burnley', 'first', 'James Trafford', 'Aijanet Muric', '24-year-old', 'last year', '2026', '9', 'Emiliano Martinez - Aston', '20m/£17.3', 'World Cup', 'Argentina', 'last year', 'Emiliano Martinez', 'Arsenal', '2020', 'Bernd Leno', 'Aston Villa', 'later that same year', 'Martinez', '31-year-old', 'three-year', 'last January', 'David Raya - Arsenal', 'David Raya', 'the summer', 'Getty Images', 'Arsenal', 'Brentford', 'next summer', 'David Raya', 'Aaron Ramsdale', 'Emirates', 'Spanish', 'Raya', 'two-year', 'Brentford', '7', 'Jordan Pickford - Everton', '20m/£17.3', 'England', 'first', 'Gareth Southgate', 'Jordan Pickford', 'Everton', '2017', 'Sunderland', '29-year-old', 'the years', 'one

In [None]:
# For every sample article, split spaCy's entities into people (PERSON) and
# organisations (ORG, used here as the "club" label) and print both lists.
for idx, article_text in enumerate(top_five_content):
    doc = nlp(article_text)
    found_entities = doc.ents

    names = [span.text for span in found_entities if span.label_ == "PERSON"]
    clubs = [span.text for span in found_entities if span.label_ == "ORG"]

    print(f"Text {idx + 1}:")
    print(f"Names: {names}")
    print(f"Clubs: {clubs}")
    print("\n")

Text 1:
Names: ['Alisson', 'Alisson', 'Ederson', 'Arijanet Muric - Burnley', 'James Trafford', 'Aijanet Muric', 'Emiliano Martinez - Aston', 'Emiliano Martinez', 'Bernd Leno', 'Aston Villa', 'Martinez', 'David Raya - Arsenal', 'David Raya', 'David Raya', 'Aaron Ramsdale', 'Raya', 'Jordan Pickford - Everton', 'Gareth Southgate', 'Jordan Pickford', 'Odysseas Vlachodimos', 'Matt Turner', 'Robert Sanchez - Chelsea', 'Robert Sanchez', 'Robin Jones - AFC Bournemouth', 'Brighton', 'Jason Steele', 'Robert Sanchez', 'Edouard Mendy', 'Kepa Arrizabalaga', 'Andre Onana - Man Utd', "David de Gea's", 'Alisson', 'Alisson', 'Jurgen Klopp', 'Claudio Bravo', 'Aaron Ramsdale - Arsenal', "Mikel Arteta's", 'Raya', 'Aaron Ramsdale', 'Gunners', 'Raya', 'Virgin Media']
Clubs: ['Premier League', 'Ederson and AlissonLiverpool', 'Ederson', 'The Football Observatory', 'Mirror Football', 'Burnley', 'Getty Images', 'Brentford', 'Everton', 'Everton', 'Vlachodimos - Nottingham Forest', 'Nottingham Forest', 'Forest', 

In [None]:
# Pull the body text of the first article for a closer look.
first_content = df['content'].iat[0]

# Parse it once; the same entity spans feed both label filters below.
doc = nlp(first_content)
first_entities = doc.ents

names = [span.text for span in first_entities if span.label_ == "PERSON"]
clubs = [span.text for span in first_entities if span.label_ == "ORG"]

# Show the raw text, then each extracted list under its own heading.
print("Content of the first entry:")
print(first_content)
for heading, extracted_values in (("\nExtracted names:", names), ("\nExtracted clubs:", clubs)):
    print(heading)
    print(extracted_values)

Content of the first entry:
The top 10 most valuable Premier League goalkeepers have been revealed, with Liverpool's Alisson and Manchester City's Ederson missing out on the number one spotManchester City and Liverpool goalkeepers Ederson and AlissonLiverpool star Alisson and Manchester City's Ederson are widely viewed as two of the best goalkeepers in the world - but where do they rank among the Premier League's most valuable shot-stoppers? Well, The Football Observatory has all the answers and you might be surprised to learn that neither Alisson or Ederson take the number one spot. When ranking players' value, they take a wide range of factors into account: age, contract length, form and so on. A total of nine different clubs are represented in the top 10, with Arsenal the only club having two players on the list. Here, Mirror Football gives you the rundown on the league's most valuable goalkeepers... 10. Arijanet Muric - Burnley (€20million/£17.3m) After losing his spot as Burnley's

In [None]:
num_chunks = 5
chunk_size = len(df) // num_chunks

# Row offsets at which each chunk starts, plus a closing offset of len(df) so
# the last chunk also takes the rows left over by the integer division.
chunk_bounds = [k * chunk_size for k in range(num_chunks)] + [len(df)]

# Write each consecutive [row_from, row_to) slice to its own CSV file.
for idx, (row_from, row_to) in enumerate(zip(chunk_bounds, chunk_bounds[1:])):
    chunk = df.iloc[row_from:row_to]
    chunk.to_csv(f"/content/drive/MyDrive/nlp data/Data/news_feed_chunk_{idx + 1}.csv", index=False)
    print(f"Chunk {idx + 1} saved.")

Chunk 1 saved.
Chunk 2 saved.
Chunk 3 saved.
Chunk 4 saved.
Chunk 5 saved.


In [None]:
from tqdm import tqdm

def extract_names(text):
    """Return the text of every PERSON entity spaCy finds in ``text``.

    Args:
        text: Article body to analyse. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for a missing cell)
            is treated as containing no entities.

    Returns:
        list[str]: Entity texts in the order spaCy reports them, duplicates
        included; an empty list for non-string input.
    """
    # Guard first so the module-level `nlp` pipeline only ever sees real
    # strings; one missing cell must not abort a whole column-wide apply.
    if not isinstance(text, str):
        return []
    return [ent.text for ent in nlp(text).ents if ent.label_ == "PERSON"]

def extract_clubs(text):
    """Return the text of every ORG entity spaCy finds in ``text``.

    ORG is the entity label this notebook uses to pick out clubs.

    Args:
        text: Article body to analyse. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for a missing cell)
            is treated as containing no entities.

    Returns:
        list[str]: Entity texts in the order spaCy reports them, duplicates
        included; an empty list for non-string input.
    """
    # Guard first so the module-level `nlp` pipeline only ever sees real
    # strings; one missing cell must not abort a whole column-wide apply.
    if not isinstance(text, str):
        return []
    return [ent.text for ent in nlp(text).ents if ent.label_ == "ORG"]

def extract_fee(text):
    """Return the text of every MONEY entity spaCy finds in ``text``.

    Args:
        text: Article body to analyse. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for a missing cell)
            is treated as containing no entities.

    Returns:
        list[str]: Entity texts in the order spaCy reports them, duplicates
        included; an empty list for non-string input.
    """
    # Guard first so the module-level `nlp` pipeline only ever sees real
    # strings; one missing cell must not abort a whole column-wide apply.
    if not isinstance(text, str):
        return []
    return [ent.text for ent in nlp(text).ents if ent.label_ == "MONEY"]

# Register `progress_apply` on pandas objects so the slow NER pass reports progress.
tqdm.pandas(desc="Processing content")

# Load the first chunk
chunk_number = 1
chunk = pd.read_csv(f"/content/drive/MyDrive/nlp data/Data/news_feed_chunk_{chunk_number}.csv")

# spaCy entity label -> name of the output column collecting entities of that label.
ENTITY_COLUMNS = {"PERSON": "names", "ORG": "clubs", "MONEY": "fee"}


def _entities_by_column(text):
    """Parse ``text`` once and group its entities by output column.

    Args:
        text: Article body. Non-string values (e.g. the NaN pandas produces
            for an empty CSV field) are treated as having no entities.

    Returns:
        dict[str, str]: One key per output column ('names', 'clubs', 'fee'),
        each holding the matching entity texts joined with ', ' (an empty
        string when nothing matched).
    """
    grouped = {column: [] for column in ENTITY_COLUMNS.values()}
    if isinstance(text, str):
        # A single pipeline call serves all three columns; parsing the article
        # separately for each label would triple the dominant cost of this cell.
        for ent in nlp(text).ents:
            column = ENTITY_COLUMNS.get(ent.label_)
            if column is not None:
                grouped[column].append(ent.text)
    return {column: ', '.join(texts) for column, texts in grouped.items()}


# One pass over the 'content' column with progress monitoring; each row yields
# a dict that is then expanded into the three new columns.
extracted = chunk['content'].progress_apply(_entities_by_column)
entity_frame = pd.DataFrame(
    extracted.tolist(),
    index=chunk.index,
    columns=list(ENTITY_COLUMNS.values()),  # keeps the columns present even for an empty chunk
)
for column in ENTITY_COLUMNS.values():
    chunk[column] = entity_frame[column]

# Save the processed chunk to a new CSV file
chunk.to_csv(f"/content/drive/MyDrive/nlp data/Data/news_feed_chunk_{chunk_number}_processed.csv", index=False)

# Display the updated DataFrame with new columns
chunk[['content', 'names', 'clubs', 'fee']].head()

Processing content:  99%|█████████▉| 26841/27001 [42:31<00:20,  7.99it/s]