##### Loading Packages

In [5]:
import nltk
from nltk.corpus import stopwords
import itertools
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.learning_curve import learning_curve
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
%matplotlib inline

## Exploratory Data Analysis and Database Cleaning

In [6]:
# Read the five per-decade article exports, one DataFrame per decade, in
# chronological order.
NYT1970s, NYT1980s, NYT1990s, NYT2000s, NYT2010s = map(
    pd.read_csv,
    (
        './Assets/NYT 1970s Shootings-2.csv',
        './Assets/NYT 1980s Shootings-3.csv',
        './Assets/NYT 1990s Shootings-3.csv',
        './Assets/NYT 2000s Shootings-2.csv',
        './Assets/NYT 2010s-4 Shootings.csv',
    ),
)

In [7]:
# Narrow every decade frame to the same four columns.
NYT1970s, NYT1980s, NYT1990s, NYT2000s, NYT2010s = (
    decade[['Incident Name', 'Title', 'Article', 'Shooter Race']]
    for decade in (NYT1970s, NYT1980s, NYT1990s, NYT2000s, NYT2010s)
)

# Separate spreadsheet; its columns are used further down to build stop words.
shootingdb = pd.read_excel(
    './Assets/MSA/Stanford_MSA_Database_for_release_06142016.xlsx'
)

In [8]:
# Stack the five per-decade frames into a single frame. ignore_index=True
# renumbers the rows 0..n-1 in the stored frame; without it each decade's own
# row labels are carried over and repeat across decades.
NYT = pd.concat(
    [NYT1970s, NYT1980s, NYT1990s, NYT2000s, NYT2010s],
    ignore_index=True,
)
NYT

Unnamed: 0,Incident Name,Title,Article,Shooter Race
0,NOLA PD,,,
1,Clara Barton Elementary,,,
2,Olean High School,Sniper's Classmate Says Guns Were ‘Whole Life’,The attack at the school has stunned this comm...,White
3,Olean High School,"3 Killed and 9 Wounded By an Upstate Sniper, 18",The youth was charged with three counts of mur...,White
4,LA Computer Learning Center,,,
5,Cal State Fullerton,,,
6,Grover Cleveland Elementary School,San Diego Girl Slays 2 With Rifle And Wounds 9...,Special weapons and tactics officers from the ...,White
7,Grover Cleveland Elementary School,Tomboy and Gun Enthusiast,"SAN DIEGO, Jan. 29 (AP) — Brenda Spencer's cla...",White
8,Grover Cleveland Elementary School,Coast Sniper Vowed She Would ‘Do Something Big’,"SAN DIEGO, Jan. 30 — Wally Spencer's eyes were...",White
9,Univeristy of South Carolina,The New York Times,Radioactive Tritium Seizure Brings Bankruptcy ...,African American


In [9]:
#Ritika EDA
def eda(dataframe):
    """Print a quick exploratory summary of ``dataframe``.

    The report contains, in order: missing-value counts per column, the number
    of duplicated rows, column dtypes, the frame's shape, ``describe`` over all
    columns, and finally each column label followed by its ``nunique()`` count.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Each section is formatted into ONE string and passed to print() as a
    # single parenthesised argument, so the text is the same whether print is
    # the Python 2 statement or the Python 3 function.
    print("Missing Values \n \n{0} \n".format(dataframe.isnull().sum()))  # find missing values
    print("Duplicate Rows \n{0} \n".format(dataframe.duplicated().sum()))  # find duplicated rows
    print("Dataframe Types \n \n{0} \n".format(dataframe.dtypes))  # datatype of each column
    print("Dataframe Shape \n{0} \n".format(dataframe.shape))  # number of rows and columns
    print("Dataframe Describe \n \n{0} \n".format(dataframe.describe(include='all')))  # describe all columns
    for feature in dataframe:  # column label, then its count of unique values
        print(feature)
        print(dataframe[feature].nunique())

In [10]:
# Summarise the combined frame: missing values, duplicate rows, dtypes, shape,
# describe() and per-column unique counts.
eda(NYT)

Missing Values 
 
Incident Name      0
Title            133
Article          133
Shooter Race     133
dtype: int64 

Duplicate Rows 
11 

Dataframe Types 
 
Incident Name    object
Title            object
Article          object
Shooter Race     object
dtype: object 

Dataframe Shape 
(1194, 4) 

Dataframe Describe 
 
                            Incident Name               Title  \
count                                1194                1061   
unique                                291                1021   
top     Tucscon, Arizona - Gabby Giffords  The New York Times   
freq                                   83                   7   

                                                  Article Shooter Race  
count                                                1061         1061  
unique                                               1032            8  
top     The Lede is a blog that remixes national and i...        White  
freq                                                    9     

In [11]:
# Drop every row that has a missing value in any column, then drop rows that
# exactly repeat an earlier row. Both calls modify NYT in place, so the row
# index keeps gaps where rows were removed.
NYT.dropna(inplace=True)
NYT.drop_duplicates(inplace=True)

In [12]:
# Re-run the summary now that rows with missing values and duplicated rows
# have been dropped.
eda(NYT)

Missing Values 
 
Incident Name    0
Title            0
Article          0
Shooter Race     0
dtype: int64 

Duplicate Rows 
0 

Dataframe Types 
 
Incident Name    object
Title            object
Article          object
Shooter Race     object
dtype: object 

Dataframe Shape 
(1053, 4) 

Dataframe Describe 
 
                            Incident Name               Title  \
count                                1053                1053   
unique                                161                1021   
top     Tucscon, Arizona - Gabby Giffords  The New York Times   
freq                                   83                   7   

                                                  Article Shooter Race  
count                                                1053         1053  
unique                                               1032            8  
top     The Lede is a blog that remixes national and i...        White  
freq                                                    9          723 

In [13]:
# Rewrite the abbreviation "N.R.A" as "NRA" wherever it appears inside a cell.
# regex=True is needed for substring matching: with a plain string,
# DataFrame.replace only touches cells whose ENTIRE value equals the string.
# The dots are escaped so they match literal periods, not any character.
NYT.replace(r"N\.R\.A", "NRA", regex=True, inplace=True)

### Generating Stop Words

In [None]:
# Raw term lists that feed the custom stop-word list.
# Five columns come from the shootingdb spreadsheet ...
names, location, city, state, title = (
    shootingdb[column].tolist()
    for column in ('Shooter Name', 'Location', 'City', 'State', 'Title')
)
# ... one from the article frame itself ...
incidents = NYT['Incident Name'].tolist()
# ... plus month names/abbreviations.
# NOTE(review): 'may' is not in this list - confirm whether that is deliberate.
month = [
    'jan', 'feb', 'mar', 'apr', 'june', 'july',
    'aug', 'sep', 'oct', 'nov', 'dec',
]

In [None]:
# All term lists concatenated, wrapped in an outer one-element list; the next
# cell unpacks that outer list with itertools.chain(*dbstops).
dbstops = [names + incidents + location + city + state + title + month]
# Preview the first ten terms. Slicing dbstops itself would return the single
# inner list in full, because the outer list has only one element.
dbstops[0][:10]

In [None]:
# Break every database term into word tokens, normalised the same way remove()
# normalises article text (non-letters -> space, lower-case, whitespace split).
# remove() compares lower-cased, letters-only tokens against `stop`, so terms
# left as e.g. "Tucscon," or "Arizona" would never match anything.
_non_letters = re.compile("[^a-zA-Z]")
dbstop = []
for term in itertools.chain(*dbstops):
    if not hasattr(term, "split"):
        # Not text (e.g. NaN from an empty spreadsheet cell): nothing to tokenise.
        continue
    dbstop.extend(_non_letters.sub(" ", term).lower().split())

# NLTK's English stop words plus the database tokens not already in that list.
# `stop` stays a plain list; duplicates are dropped to keep lookups short.
stop = stopwords.words('english')
stop += sorted(set(dbstop) - set(stop))

### Cleaning Dataframe

In [None]:
def remove(NYT, stop_words=None):
    """Normalise one piece of text and strip stop words from it.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, and tokens found in the stop list
    are dropped. The surviving tokens are joined with single spaces.

    Parameters
    ----------
    NYT : str
        Raw text to clean. (The parameter name is kept for backward
        compatibility; inside this function it shadows the module-level
        ``NYT`` frame.)
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the module-level ``stop`` list is
        used, which is what ``Series.apply(remove)`` relies on.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    if stop_words is None:
        stop_words = stop
    # Hash-based lookup instead of scanning a list once per token.
    blocked = frozenset(stop_words)
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", NYT)
    # Lower-case, split into words, and keep those not in the stop list
    meaningful_words = [
        word for word in letters_only.lower().split() if word not in blocked
    ]
    return " ".join(meaningful_words)

In [None]:
# Overwrite each headline with its cleaned form: remove() keeps letters only,
# lower-cases the text and drops tokens found in the stop list.
NYT['Title']=NYT['Title'].apply(remove)

In [None]:
# Overwrite each article body with its cleaned form: remove() keeps letters
# only, lower-cases the text and drops tokens found in the stop list.
NYT['Article']=NYT['Article'].apply(remove)

In [None]:
# Renumber the rows 0..n-1 in place; drop=True discards the old index instead
# of keeping it as a new column.
NYT.reset_index(drop=True, inplace=True)

In [None]:
# Show the first rows of the cleaned frame as a spot check.
NYT.head()

##### Base Accuracy

### CountVectorizer

### Model