In [1]:
import pandas as pd
df = pd.read_csv("../data/ufo_sightings.csv")
# Display the DataFrame
print(df)

              date_time             city_area state country ufo_shape  \
0      10/10/1949 20:30            san marcos    tx      us  cylinder   
1      10/10/1949 21:00          lackland afb    tx     NaN     light   
2      10/10/1955 17:00  chester (uk/england)   NaN      gb    circle   
3      10/10/1956 21:00                  edna    tx      us    circle   
4      10/10/1960 20:00               kaneohe    hi      us     light   
...                 ...                   ...   ...     ...       ...   
80327    9/9/2013 21:15             nashville    tn      us     light   
80328    9/9/2013 22:00                 boise    id      us    circle   
80329    9/9/2013 22:00                  napa    ca      us     other   
80330    9/9/2013 22:20                vienna    va      us    circle   
80331    9/9/2013 23:00                edmond    ok      us     cigar   

       encounter_length described_encounter_length  \
0                2700.0                 45 minutes   
1              

In [8]:
df_new = df.dropna()
print(df_new)
df_new.isnull().values.any()


              date_time   city_area state country ufo_shape  encounter_length  \
0      10/10/1949 20:30  san marcos    tx      us  cylinder            2700.0   
3      10/10/1956 21:00        edna    tx      us    circle              20.0   
4      10/10/1960 20:00     kaneohe    hi      us     light             900.0   
5      10/10/1961 19:00     bristol    tn      us    sphere             300.0   
7      10/10/1965 23:45     norwalk    ct      us      disk            1200.0   
...                 ...         ...   ...     ...       ...               ...   
80327    9/9/2013 21:15   nashville    tn      us     light             600.0   
80328    9/9/2013 22:00       boise    id      us    circle            1200.0   
80329    9/9/2013 22:00        napa    ca      us     other            1200.0   
80330    9/9/2013 22:20      vienna    va      us    circle               5.0   
80331    9/9/2013 23:00      edmond    ok      us     cigar            1020.0   

      described_encounter_l

False

In [9]:
df_new.to_csv('../data/ufo_sightings_NMV.csv', index=False) # saving it as No Missing Values (MSV)

In [6]:
import csv
from collections import Counter
import re
from nltk.corpus import stopwords

# Download NLTK stopwords
import nltk
nltk.download('stopwords')

# Read the data
data = []
with open('../data/ufo_sightings_NMV.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)

# Extract descriptions
descriptions = [row['description'] for row in data]

# Initialize NLTK stopwords
stop_words = set(stopwords.words('english'))
stop_words.add('44')

# Tokenize and count words
word_counter = Counter()
for description in descriptions:
    words = re.findall(r'\b(?![0-9]+\b)[a-zA-Z]+\b', description.lower())
    # words = re.findall(r'\b\w+\b', description.lower())  # Tokenize words
    words = [word for word in words if word not in stop_words]  # Remove stop words
    word_counter.update(words)

# Find the most common words
most_common_words = word_counter.most_common(50)  # Change 10 to the number of top words you want to find

# Print the results
for word, count in most_common_words:
    print(f'{word}: {count}')


[nltk_data] Downloading package stopwords to /Users/nachi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


lights: 15155
light: 14959
sky: 14338
object: 12530
bright: 11100
moving: 7810
orange: 7560
white: 6860
red: 6775
shaped: 5981
saw: 5642
seen: 4776
craft: 4561
like: 4541
flying: 4244
ufo: 4231
two: 3726
large: 3698
objects: 3480
night: 3252
quot: 3091
green: 3075
one: 3006
across: 2999
hovering: 2949
triangle: 2910
north: 2905
fast: 2780
blue: 2768
star: 2763
south: 2665
west: 2657
east: 2653
shape: 2646
formation: 2463
high: 2459
slowly: 2293
speed: 2260
three: 2258
nuforc: 2214
note: 2207
pd: 2190
triangular: 2189
low: 2159
glowing: 2120
fireball: 2072
sighting: 2035
sound: 1997
ball: 1996
looked: 1934


In [8]:
with open('../data/most_common_words.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Word', 'Frequency'])  # Write header
    writer.writerows(most_common_words)

In [13]:
import csv
import re
from collections import Counter

# Read the data
data = []
with open('../data/ufo_sightings_NMV.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        data.append(row)

# Extract descriptions, latitude, and longitude
descriptions = [row['description'] for row in data]
coordinates = [(row['latitude'], row['longitude']) for row in data]

# Initialize NLTK stopwords
stop_words = set(stopwords.words('english'))

# Tokenize and count words while associating with coordinates
word_freq_coordinates = {}
for description, (latitude, longitude) in zip(descriptions, coordinates):
    words = re.findall(r'\b(?![0-9]+\b)[a-zA-Z]+\b', description.lower())
    words = [word for word in words if word not in stop_words]
    for word in words:
        if word not in word_freq_coordinates:
            word_freq_coordinates[word] = {'frequency': 0, 'coordinates': []}
        word_freq_coordinates[word]['frequency'] += 1
        word_freq_coordinates[word]['coordinates'].append((latitude, longitude))

# Find the most common words
most_common_words = sorted(word_freq_coordinates.items(), key=lambda x: x[1]['frequency'], reverse=True)[:10]

# Associate most common words with coordinates
word_coordinates = {}
for word, _ in most_common_words:
    word_coordinates[word] = word_freq_coordinates[word]['coordinates']

# Now you have a dictionary where the keys are the most common words
# and the values are lists of coordinates associated with each word


In [14]:
import csv

# Assuming word_coordinates is the dictionary containing word-coordinate mappings

# Define the file name for the CSV
output_file = '../data/word_coordinates.csv'

# Write the word-coordinate mappings to the CSV file
with open(output_file, 'w', newline='') as csvfile:
    fieldnames = ['word', 'latitude', 'longitude']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
    # Write the header
    writer.writeheader()
    
    # Write each word-coordinate mapping
    for word, coordinates in word_coordinates.items():
        for latitude, longitude in coordinates:
            writer.writerow({'word': word, 'latitude': latitude, 'longitude': longitude})

print("Word-coordinate mappings saved to", output_file)

Word-coordinate mappings saved to ../data/word_coordinates.csv
