In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw sightings table.
ufo = pd.read_csv('editufo.csv')

import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# NOTE: from here on `stopwords` is rebound from the corpus reader to a plain
# list of words (English stopwords plus a few project-specific additions).
stopwords = nltk.corpus.stopwords.words("english")
my_stopwords = ['like', 'seemed', 'could']
stopwords.extend(my_stopwords)
# Set copy of the list so each membership test below is O(1) instead of a
# linear scan of the list for every token.
stopword_set = set(stopwords)

# Matches runs of one or more Unicode word characters. The pattern is a raw
# string: in a plain string '\w' is an invalid escape sequence that newer
# Python versions warn about.
regexp = RegexpTokenizer(r'\w+')

# Lower-case the descriptions. Missing descriptions are turned into empty
# strings first; otherwise astype(str) would convert NaN into the literal
# word "nan", which would then be tokenized and counted like real text.
ufo['text'] = ufo['text'].fillna('').astype(str).str.lower()

# Split each description into word tokens, then drop the stopwords.
ufo['text_token'] = ufo['text'].apply(regexp.tokenize)
ufo['text_token'] = ufo['text_token'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_set]
)

# Re-join the tokens longer than two characters into one cleaned string per row.
ufo['text_string'] = ufo['text_token'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if len(tok) > 2)
)

# Concatenate every cleaned description and count word frequencies over the
# whole corpus.
all_words = ' '.join(ufo['text_string'])

tokenized_words = nltk.tokenize.word_tokenize(all_words)

from nltk.probability import FreqDist

fdist = FreqDist(tokenized_words)

##regional lists
# Region name -> state abbreviations belonging to it. Built once at definition
# time instead of being re-created on every call.
_REGION_STATES = {
    'Northeast': ('CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA'),
    'Midwest': ('IL', 'IN', 'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'),
    'South': ('DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV', 'AL', 'KY', 'MS',
              'TN', 'AR', 'LA', 'OK', 'TX'),
    'West': ('AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR', 'WA'),
}

# Inverted table (abbreviation -> region) for a single dictionary lookup.
_STATE_TO_REGION = {
    abbr: region
    for region, abbrs in _REGION_STATES.items()
    for abbr in abbrs
}


def classify_region(state):
    """Return the region name for a two-letter state abbreviation.

    Parameters
    ----------
    state : str or any
        State abbreviation such as ``'VA'``. Surrounding whitespace and
        letter case are ignored, so ``' va '`` is treated like ``'VA'``.

    Returns
    -------
    str or None
        ``'Northeast'``, ``'Midwest'``, ``'South'`` or ``'West'``; ``None``
        when the value is not a string (e.g. NaN) or is not one of the
        listed abbreviations (e.g. a Canadian province code).
    """
    # Non-string values (NaN from missing data, None, numbers) have no region.
    if not isinstance(state, str):
        return None
    return _STATE_TO_REGION.get(state.strip().upper())

# Tag every sighting with the region returned by classify_region for its state.
ufo['region'] = ufo['state'].map(classify_region)

# Slice the table into one data frame per region.
region_labels = ufo['region']
Northeast = ufo.loc[region_labels.eq('Northeast')]
Midwest = ufo.loc[region_labels.eq('Midwest')]
South = ufo.loc[region_labels.eq('South')]
West = ufo.loc[region_labels.eq('West')]

# Export each data frame to a CSV file
#Northeast.to_csv('Northeast.csv', index=False)
#Midwest.to_csv('Midwest.csv', index=False)
#South.to_csv('South.csv', index=False)
#West.to_csv('West.csv', index=False)

# Keep only the 50 most frequent words. Note that `fdist` is rebound here:
# it goes from the FreqDist to a word -> count Series.
top50 = fdist.most_common(50)
fdist = pd.Series({word: count for word, count in top50})

import seaborn as sns
sns.set_theme(style="ticks")

#sns.barplot(y=fdist.index, x=fdist.values, color='blue');

import plotly.express as px

# Bar chart with the words on the y axis and their counts on the x axis.
fig = px.bar(x=fdist.values, y=fdist.index)

# Order the word categories by their total count.
fig.update_layout(
    barmode='stack',
    yaxis=dict(categoryorder='total ascending'),
)

# Uncomment to render the figure.
#fig.show()

nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

from copy import copy, deepcopy

# Work on a separate copy so the sentiment columns are not added to `ufo`.
ufo_pol = ufo.copy()

# Score every cleaned description. Each call returns a dict of scores, so the
# list of dicts is turned into a frame in one step (one column per score)
# instead of building a Series per row with .apply(pd.Series), which is much
# slower on large tables. The original row index is reused so the columns
# line up with `ufo_pol` when concatenated.
polarity_scores = [analyzer.polarity_scores(text) for text in ufo_pol['text_string']]
polarity = pd.DataFrame(polarity_scores, index=ufo_pol.index)
ufo_pol = pd.concat([ufo_pol, polarity], axis=1)

# Label each row by the sign of its compound score:
# > 0 -> 'positive', == 0 -> 'neutral', anything else -> 'negative'.
ufo_pol['sentiment'] = ufo_pol['compound'].apply(
    lambda score: 'positive' if score > 0 else ('neutral' if score == 0 else 'negative')
)
display(ufo_pol.head())

# Create new variable with sentiment "neutral," "positive" and "negative"
#ufo_pol['strong_sentiment'] = ufo_pol['compound'].apply(lambda x: 'positive' if x > .5 else 'negative' if x < -.5 else 'neutral')
#ufo_pol.head(50)

#sns.countplot(y='sentiment', data=ufo_pol, palette=['#b2d8d8',"#008080", '#db3d13'])
#sns.countplot(y='strong_sentiment', data=ufo_pol, palette=['#b2d8d8',"#008080", '#db3d13']);

# Lineplot
#g = sns.lineplot(x='date_time', y='compound', data=ufo_pol)

#g.set(xticklabels=[]) 
#g.set(title='Sentiment of Description')
#g.set(xlabel="2004 - 2019")
#g.set(ylabel="Sentiment")
#g.tick_params(bottom=False)

#g.axhline(0, ls='--', c = 'grey');

#ufo_pol.loc[ufo_pol['compound'].idxmax()].values
#ufo_pol.loc[ufo_pol['compound'].idxmin()].values 

# Record whether 'duration_sec' or 'region' contain any missing values
# (kept as flags for inspection).
null_duration = ufo_pol['duration_sec'].isnull().any()
null_region = ufo_pol['region'].isnull().any()

# Drop the rows where either column is missing. dropna() returns the frame's
# rows unchanged when nothing is missing, so no guard is needed, and
# reassigning instead of using inplace=True keeps the step safe to re-run.
ufo_pol = ufo_pol.dropna(subset=['duration_sec', 'region'])

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Convert the 'compound' column to a float
#ufo_pol['compound'] = ufo_pol['compound'].astype(float)

# Select the features and target column
#X = ufo_pol[['region', 'duration_sec']]
#y = ufo_pol['compound']

# One-hot encode the categorical feature 'region'
#X = pd.get_dummies(X, columns=['region'])

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train the model
#model = RandomForestRegressor()
#model.fit(X_train, y_train)

# Evaluate the model on the test set
#score = model.score(X_test, y_test)
#print(f'Test score: {score:.2f}')


from sklearn.linear_model import LinearRegression


# Make sure the 'compound' column is stored as float.
ufo_pol['compound'] = ufo_pol['compound'].astype(float)

# Features: the compound sentiment score and 'duration_sec'.
# Target: the region label.
X = ufo_pol[['compound', 'duration_sec']]
y = ufo_pol['region']

# One-hot encode the *target* (region is what is being predicted here, not a
# feature). `y` is a Series, so get_dummies needs no `columns=` argument; that
# parameter only selects columns of a DataFrame.
y = pd.get_dummies(y)

# Split the data into training and testing sets. The seed is fixed so the
# split, and therefore the printed score, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set. For a regressor, score() is the R^2
# coefficient of determination, not a classification accuracy; the recorded
# run of this cell printed -0.00.
# NOTE(review): 'region' is categorical, so a classifier scored by accuracy
# would likely suit this target better than regressing one-hot columns;
# confirm the intent.
score = model.score(X_test, y_test)
print(f'Test score: {score:.2f}')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lesliebushara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lesliebushara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lesliebushara/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,city,state,date_time,shape,text,city_latitude,city_longitude,duration_sec,text_token,text_string,region,neg,neu,pos,compound,sentiment
0,Chester,VA,2019-12-12 18:43:00,light,my wife was driving southeast on a fairly popu...,37.343152,-77.408582,5.0,"[wife, driving, southeast, fairly, populated, ...",wife driving southeast fairly populated main s...,South,0.085,0.859,0.056,0.0516,positive
1,Rocky Hill,CT,2019-03-22 18:30:00,circle,i think that i may caught a ufo on the nbc nig...,41.6648,-72.6393,4.0,"[think, may, caught, ufo, nbc, nightly, news, ...",think may caught ufo nbc nightly news aired ma...,Northeast,0.0,1.0,0.0,0.0,neutral
2,Ottawa,ON,2019-04-17 02:00:00,teardrop,i was driving towards the intersection of fall...,45.381383,-75.708501,10.0,"[driving, towards, intersection, fallowfield, ...",driving towards intersection fallowfield eagle...,,0.356,0.644,0.0,-0.7506,negative
3,Peoria,NY,2009-03-15 18:00:00,cigar,"in peoria, arizona, i saw a cigar shaped craft...",,,120.0,"[peoria, arizona, saw, cigar, shaped, craft, f...",peoria arizona saw cigar shaped craft floating...,Northeast,0.012,0.791,0.197,0.9626,positive
4,Kirbyville,TX,2019-04-02 20:25:00,disk,"the object has flashing lights that are green,...",30.6772,-94.0052,900.0,"[object, flashing, lights, green, blue, red, w...",object flashing lights green blue red white li...,South,0.097,0.776,0.126,0.5423,positive


Test score: -0.00
