In [None]:
# Step 1: Install and Load Required Libraries
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re

import pandas as pd
import spacy
from spacy.training import Example

# Step 2: Load the Pretrained spaCy Model
# The small English pipeline is loaded with "lookups" passed in `exclude`.
nlp = spacy.load("en_core_web_sm", exclude=["lookups"])

# Step 3: Load and Preprocess the Data
# Read the training set and, in the same expression, discard every row that
# lacks either the tweet text or its gold location.
train_df = pd.read_csv('/content/sample_data/Train_1.csv').dropna(
    subset=['text', 'location']
)

# Step 4: Extract Locations using spaCy's NER
def extract_locations(text):
    """Return the location mentions found in ``text`` as one space-joined string.

    The text is run through the module-level ``nlp`` pipeline and the surface
    text of every entity labeled GPE, LOC or FAC (geopolitical entities,
    locations, facilities) is collected in document order.

    Args:
        text: The tweet text. Missing values (``None`` or a float NaN, which is
            how pandas represents an empty CSV cell) are tolerated.

    Returns:
        A single string with the matched entity texts separated by spaces;
        an empty string when nothing matches or when ``text`` is missing.
    """
    # A missing cell would make nlp() raise; treat it as "no locations" so one
    # bad row cannot abort a whole DataFrame.apply(). NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    doc = nlp(text)
    location_labels = ('GPE', 'LOC', 'FAC')
    return " ".join(ent.text for ent in doc.ents if ent.label_ in location_labels)

# Run the pretrained pipeline over every training tweet and keep the
# space-joined predictions in a new column next to the gold labels
train_df['predicted_locations'] = train_df['text'].map(extract_locations)

# Preview the first rows: gold location vs. predicted locations
preview_columns = ['tweet_id', 'text', 'location', 'predicted_locations']
print(train_df[preview_columns].head())

# Step 5: Fine-Tuning spaCy's NER Model (Optional)
# Container for the (text, annotations) pairs used for fine-tuning
train_data = list()

def get_non_overlapping_entities(doc, locations):
    """Build non-overlapping character-offset entity annotations for a text.

    Each whitespace-separated word of ``locations`` is searched for in ``doc``
    and, when found, recorded as its own ``(start, end, 'GPE')`` tuple.

    Compared with a plain ``str.find`` lookup this:
      * only accepts whole-word matches, so "Maryland" is not matched inside
        "Marylanders" (such a span would not line up with token boundaries);
      * keeps scanning later occurrences when an earlier one is already taken,
        so a location word that is listed twice can be annotated twice.

    Args:
        doc: The raw text (a ``str``) in which to locate the words.
        locations: Space-separated location words for that text.

    Returns:
        A list of ``(start, end, 'GPE')`` tuples, in the order the words appear
        in ``locations``. Words without a free whole-word match are skipped.
        An empty list is returned if either argument is not a string.
    """
    # Missing values (None / NaN) cannot be searched or split.
    if not isinstance(doc, str) or not isinstance(locations, str):
        return []

    used_spans = []
    entities = []
    for loc in locations.split():
        # Lookarounds instead of \b so words that start or end with
        # punctuation (e.g. "D.C.") can still match.
        pattern = r"(?<!\w)" + re.escape(loc) + r"(?!\w)"
        for match in re.finditer(pattern, doc):
            start, end = match.span()
            # A candidate is free if it lies entirely before or after every
            # span that has already been claimed.
            if all(end <= used_start or start >= used_end
                   for used_start, used_end in used_spans):
                entities.append((start, end, 'GPE'))  # Mark location as GPE
                used_spans.append((start, end))
                break
    return entities

# Pair every tweet with the character-offset entity annotations derived from
# its gold location string and add the pairs to train_data
train_data.extend(
    (tweet_text, {'entities': get_non_overlapping_entities(tweet_text, gold_location)})
    for tweet_text, gold_location in zip(train_df['text'], train_df['location'])
)

# Fine-tune the NER model
ner = nlp.get_pipe('ner')

# Register each distinct label used in the training annotations once
# (sorted only to make the registration order deterministic).
training_labels = sorted(
    {ent[2] for _, annotations in train_data for ent in annotations['entities']}
)
for label in training_labels:
    ner.add_label(label)

# Disable other pipes and only train the NER component: inside this context
# only "ner" is enabled, so nlp.update() leaves the other components alone.
# The disabled pipes are restored when the block exits.
with nlp.select_pipes(enable=["ner"]):
    optimizer = nlp.resume_training()  # Use resume_training instead of initialize

    for itn in range(10):  # Number of iterations
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], sgd=optimizer)

# Save the fine-tuned model (done outside the context so the full pipeline,
# with all components re-enabled, is what gets written to disk)
nlp.to_disk('fine_tuned_ner_model')






                 tweet_id                                               text  \
1  ID_1001136696589631488  Flash floods struck a Maryland city on Sunday,...   
2  ID_1001136950345109504  State of emergency declared for Maryland flood...   
3  ID_1001137334056833024  Other parts of Maryland also saw significant d...   
4  ID_1001138374923579392  Catastrophic Flooding Slams Ellicott City, Mar...   
5  ID_1001138377717157888  WATCH: 1 missing after flash #FLOODING devasta...   

                 location                  predicted_locations  
1                Maryland                             Maryland  
2                Maryland                             Maryland  
3      Baltimore Maryland  Maryland Baltimore Dundalk Maryland  
4  Ellicott City Maryland                             Maryland  
5  Ellicott City Maryland               Ellicott City Maryland  




In [None]:
# Step 6: Load Test Data and Predict Locations
test_df = pd.read_csv('/content/sample_data/Test.csv')

# Apply the fine-tuned NER model on test data.
# Rows are not dropped here (presumably every tweet_id must appear in the
# submission), so a missing text is replaced by an empty string, which simply
# yields an empty prediction instead of making nlp() raise on NaN.
test_df['predicted_locations'] = test_df['text'].fillna('').apply(extract_locations)

# Step 7: Create Submission File
# Keep only the id and the prediction, exposing the prediction as `location`
submission = test_df[['tweet_id', 'predicted_locations']].rename(columns={'predicted_locations': 'location'})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!


In [None]:
# prompt: in the submission.csv file, replace null values in the location column with the most frequently occurring location

import pandas as pd

# Load the submission file
submission = pd.read_csv('submission.csv')

# Find the most occurring location. mode() ignores nulls and returns an empty
# Series when the column has no non-null value, so check before indexing.
location_modes = submission['location'].mode()
if location_modes.empty:
    raise ValueError("Cannot fill missing locations: 'location' has no non-null values.")
most_frequent_location = location_modes.iloc[0]

# Replace null values with the most frequent location.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# submission['location'] acts on an intermediate object, which pandas warns
# about and which will no longer update the frame in pandas 3.0.
submission['location'] = submission['location'].fillna(most_frequent_location)

# Save the updated submission file
submission.to_csv('submission.csv', index=False)

print("Submission file updated successfully!")


Submission file updated successfully!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  submission['location'].fillna(most_frequent_location, inplace=True)
